Mercurial > repos > galaxyp > peptide_genomic_coordinate
changeset 1:cb0378d2d487 draft default tip
"planemo upload commit 43b42fdeef93a498e893fe13f99a076af294e603"
author | galaxyp |
---|---|
date | Sun, 14 Mar 2021 03:01:11 +0000 |
parents | 5f49ffce52cb |
children | |
files | peptide_genomic_coordinate.py peptide_genomic_coordinate.xml test-data/peptides.tabular test-data/peptides_BED.bed test-data/peptides_mapped.bed test-data/test_genomic_mapping_sqlite.sqlite test-data/test_mz_to_sqlite.sqlite |
diffstat | 7 files changed, 199 insertions(+), 152 deletions(-) [+] |
line wrap: on
line diff
--- a/peptide_genomic_coordinate.py Wed Apr 03 04:04:18 2019 -0400 +++ b/peptide_genomic_coordinate.py Sun Mar 14 03:01:11 2021 +0000 @@ -1,154 +1,124 @@ #!/usr/bin/env python -# +# # Author: Praveen Kumar # University of Minnesota # -# Get peptide's genomic coordinate from the protein's genomic mapping sqlite file (which is derived from the https://toolshed.g2.bx.psu.edu/view/galaxyp/translate_bed/038ecf54cbec) -# +# Get peptide's genomic coordinate from the protein's genomic mapping sqlite file +# (which is derived from the https://toolshed.g2.bx.psu.edu/view/galaxyp/translate_bed/038ecf54cbec) +# # python peptideGenomicCoordinate.py <peptide_list> <mz_to_sqlite DB> <genomic mapping file DB> <output.bed> -# +# +import argparse +import sqlite3 import sys -import sqlite3 + + +pep_stmt = """\ +SELECT dBSequence_ref, start, end, peptide_ref \ +FROM peptide_evidence e JOIN peptides p on e.peptide_ref = p.id \ +WHERE isDecoy = 'false' AND p.sequence = ?\ +""" + +map_stmt = """ +SELECT name, chrom, start, end, strand, cds_start, cds_end \ +FROM feature_cds_map \ +WHERE name = ? \ +AND cds_end >= ? AND cds_start <= ? \ +ORDER BY cds_start\ +""" def main(): - conn = sqlite3.connect(sys.argv[2]) - c = conn.cursor() - c.execute("DROP table if exists novel") - conn.commit() - c.execute("CREATE TABLE novel(peptide text)") - pepfile = open(sys.argv[1],"r") - - pep_seq = [] + parser = argparse.ArgumentParser(description='BED file for peptides') + parser.add_argument('peptides_file', + metavar='peptides.tabular', + type=argparse.FileType('r'), + help='List of peptides, one per line') + parser.add_argument('mz_to_sqlite', + metavar='mz.sqlite', + help='mz_to_sqlite sqlite database') + parser.add_argument('genome_mapping', + metavar='genome_mapping.sqlite', + help='genome_mapping sqlite database') + parser.add_argument('bed_file', + metavar='peptides.bed', + type=argparse.FileType('w'), + help='BED file of peptide genomic locations') + parser.add_argument('-a', '--accession', + action='store_true', + help='Append the accession to the peptide for BED name') + parser.add_argument('-d', '--debug', + action='store_true', + help='Debug') + args = parser.parse_args() + + pconn = sqlite3.connect(args.mz_to_sqlite) + pc = pconn.cursor() + mconn = sqlite3.connect(args.genome_mapping) + mc = mconn.cursor() + outfh = args.bed_file + pepfile = args.peptides_file for seq in pepfile.readlines(): seq = seq.strip() - pep_seq.append(tuple([seq])) - - c.executemany("insert into novel(peptide) values(?)", pep_seq) - conn.commit() - - c.execute("SELECT distinct psm.sequence, ps.id, ps.sequence from db_sequence ps, psm_entries psm, novel n, proteins_by_peptide pbp where psm.sequence = n.peptide and pbp.peptide_ref = psm.id and pbp.id = ps.id") - rows = c.fetchall() - - conn1 = sqlite3.connect(sys.argv[3]) - c1 = conn1.cursor() - - outfh = open(sys.argv[4], "w") - - master_dict = {} - for each in rows: - peptide = each[0] - acc = each[1] - acc_seq = each[2] - - c1.execute("SELECT chrom,start,end,name,strand,cds_start,cds_end FROM feature_cds_map map WHERE map.name = '"+acc+"'") - coordinates = c1.fetchall() - - if len(coordinates) != 0: - pep_start = 0 - pep_end = 0 - flag = 0 - splice_flag = 0 - spliced_peptide = [] - for each_entry in coordinates: - chromosome = each_entry[0] - start = int(each_entry[1]) - end = int(each_entry[2]) - strand = each_entry[4] - cds_start = int(each_entry[5]) - cds_end = int(each_entry[6]) - pep_pos_start = (acc_seq.find(peptide)*3) - pep_pos_end = pep_pos_start + (len(peptide)*3) - if pep_pos_start >= cds_start and pep_pos_end <= cds_end: - if strand == "+": - pep_start = start + pep_pos_start - cds_start - pep_end = start + pep_pos_end - cds_start - pep_thick_start = 0 - pep_thick_end = len(peptide) - flag == 1 - else: - pep_end = end - pep_pos_start + cds_start - pep_start = end - pep_pos_end + cds_start - pep_thick_start = 0 - pep_thick_end = len(peptide) - flag == 1 - spliced_peptide = [] - splice_flag = 0 + pc.execute(pep_stmt, (seq,)) + pep_refs = pc.fetchall() + for pep_ref in pep_refs: + (acc, pep_start, pep_end, pep_seq) = pep_ref + cds_start = (pep_start - 1) * 3 + cds_end = pep_end * 3 + if args.debug: + print('%s\t%s\t%s\t%d\t%d' % (acc, pep_start, pep_end, cds_start, cds_end), file=sys.stdout) + mc.execute(map_stmt, (acc, cds_start, cds_end)) + exons = mc.fetchall() + if args.debug: + print('\n'.join([str(e) for e in exons]), file=sys.stdout) + if exons: + chrom = exons[0][1] + strand = exons[0][4] + if strand == '+': + start = exons[0][2] + cds_start + end = exons[-1][2] + cds_end - exons[-1][5] + blk_start = [] + blk_size = [] + for exon in exons: + offset = cds_start if cds_start > exon[5] else 0 + bstart = exon[2] + offset + bsize = min(cds_end, exon[6]) - max(cds_start, exon[5]) + if args.debug: + print('bstart %d\tbsize %d\t %d' % (bstart, bsize, offset), file=sys.stdout) + blk_start.append(bstart - start) + blk_size.append(bsize) else: - if flag == 0: - if strand == "+": - if pep_pos_start >= cds_start and pep_pos_start <= cds_end and pep_pos_end > cds_end: - pep_start = start + pep_pos_start - cds_start - pep_end = end - pep_thick_start = 0 - pep_thick_end = (pep_end-pep_start) - spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end]) - splice_flag = splice_flag + 1 - if splice_flag == 2: - flag = 1 - elif pep_pos_end >= cds_start and pep_pos_end <= cds_end and pep_pos_start < cds_start: - pep_start = start - pep_end = start + pep_pos_end - cds_start - pep_thick_start = (len(peptide)*3)-(pep_end-pep_start) - pep_thick_end = (len(peptide)*3) - spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end]) - splice_flag = splice_flag + 1 - if splice_flag == 2: - flag = 1 - else: - pass - else: - if pep_pos_start >= cds_start and pep_pos_start <= cds_end and pep_pos_end >= cds_end: - pep_start = start - pep_end = end - pep_pos_start - cds_start - pep_thick_start = 0 - pep_thick_end = (pep_end-pep_start) - spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end]) - splice_flag = splice_flag + 1 - if splice_flag == 2: - flag = 1 - elif pep_pos_end >= cds_start and pep_pos_end <= cds_end and pep_pos_start <= cds_start: - pep_start = end - pep_pos_end + cds_start - pep_end = end - pep_thick_start = (len(peptide)*3)-(pep_end-pep_start) - pep_thick_end = (len(peptide)*3) - spliced_peptide.append([pep_start,pep_end,pep_thick_start,pep_thick_end]) - splice_flag = splice_flag + 1 - if splice_flag == 2: - flag = 1 - else: - pass - - if len(spliced_peptide) == 0: - if strand == "+": - bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "1", str(pep_end-pep_start), "0"] - else: - bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "1", str(pep_end-pep_start), "0"] - outfh.write("\t".join(bed_line)+"\n") - else: - if strand == "+": - pep_entry = spliced_peptide - pep_start = min([pep_entry[0][0], pep_entry[1][0]]) - pep_end = max([pep_entry[0][1], pep_entry[1][1]]) - blockSize = [str(min([pep_entry[0][3], pep_entry[1][3]])),str(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]]))] - blockStarts = ["0", str(pep_end-pep_start-(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]])))] - bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "2", ",".join(blockSize), ",".join(blockStarts)] - outfh.write("\t".join(bed_line)+"\n") - else: - pep_entry = spliced_peptide - pep_start = min([pep_entry[0][0], pep_entry[1][0]]) - pep_end = max([pep_entry[0][1], pep_entry[1][1]]) - blockSize = [str(min([pep_entry[0][3], pep_entry[1][3]])),str(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]]))] - blockStarts = ["0", str(pep_end-pep_start-(max([pep_entry[0][3], pep_entry[1][3]])-min([pep_entry[0][3], pep_entry[1][3]])))] - bed_line = [chromosome, str(pep_start), str(pep_end), peptide, "255", strand, str(pep_start), str(pep_end), "0", "2", ",".join(blockSize), ",".join(blockStarts)] - outfh.write("\t".join(bed_line)+"\n") - c.execute("DROP table novel") - conn.commit() - conn.close() - conn1.close() + start = exons[-1][2] + exons[-1][6] - cds_end + end = exons[0][3] - cds_start + exons[0][5] + blk_start = [] + blk_size = [] + for exon in reversed(exons): + bstart = exon[2] + exon[6] - min(exon[6], cds_end) + bsize = min(cds_end, exon[6]) - max(cds_start, exon[5]) + # bend = exon[3] - (exon[5] - max(exon[5], cds_start)) + bend = exon[3] - min(cds_start - exon[5], cds_start) + bend = exon[3] - bsize + if args.debug: + print('bstart %d\tbsize %d\tbend %d' % (bstart, bsize, bend), file=sys.stdout) + blk_start.append(bstart - start) + blk_size.append(bsize) + bed_line = [str(chrom), str(start), str(end), + '_'.join([seq, acc]) if args.accession else seq, + '255', strand, + str(start), str(end), + '0,0,0', + str(len(blk_start)), + ','.join([str(b) for b in blk_size]), + ','.join([str(b) for b in blk_start])] + if args.debug: + print('\t'.join(bed_line), file=sys.stdout) + outfh.write('\t'.join(bed_line) + '\n') + pconn.close() + mconn.close() outfh.close() pepfile.close() - - return None -if __name__ == "__main__": + + +if __name__ == '__main__': main()
--- a/peptide_genomic_coordinate.xml Wed Apr 03 04:04:18 2019 -0400 +++ b/peptide_genomic_coordinate.xml Sun Mar 14 03:01:11 2021 +0000 @@ -1,4 +1,4 @@ -<tool id="peptide_genomic_coordinate" name="Peptide Genomic Coodinate" version="0.1.1"> +<tool id="peptide_genomic_coordinate" name="Peptide Genomic Coordinate" version="1.0.0"> <description>Get Peptide's genomic coordinate using mzsqlite DB and genomic mapping sqlite DB</description> <requirements> <requirement type="package" version="3.7.1">python</requirement> @@ -27,11 +27,7 @@ <param name="peptideinput" value="peptides.tabular"/> <param name="mzsqlite" value="test_mz_to_sqlite.sqlite"/> <param name="mapping" value="test_genomic_mapping_sqlite.sqlite"/> - <output name="peptide_bed"> - <assert_contents> - <has_text text="115176449" /> - </assert_contents> - </output> + <output name="peptide_bed" file="peptides_mapped.bed"/> </test> </tests> <help><![CDATA[
--- a/test-data/peptides.tabular Wed Apr 03 04:04:18 2019 -0400 +++ b/test-data/peptides.tabular Sun Mar 14 03:01:11 2021 +0000 @@ -1,4 +1,73 @@ -AVDPDSSAEASGLR -DGDLENPVLYSGAVK +AAASVGTASQGRRAR +AALAAAEVGPR +AALLPACRLGCPG +AFLGMDPPSHGNTCTR +AGLICVANAR +AGWTGGRGLRGR +APGRASYISTETGLY +AVDPDSSAEASGLRAQDR +AVETSANPERT +AVPAALSFASR +CDCQDCITLVSK +DAGTGWLGLAG +DEEASRAPVAR +DGARFPTPLAQGLR +DLPTTASQVLK +DMPWISGLSQGALEKQR DSGASGSILEASAAR -ELGSSDLTAR +EAFSLFDKDGDGTITTK +EFIFYYLNK +EGLCYSELERQR +ELLIILARCGL +ERILTGEGPAAEHR +ERLDPRGLALAR +ESGYFSGYNLNL +ESSREALVEPTSESPRPALAR +GDPASPRISAAR +GGPGPGGAQGR +GGSPAEGAIR +GGTGKQSAPEMIAEPEK +GLAGQDSSQVPK +GRGGGRGGGGPGGTR +IFPSCFSSRTDDR +IQLTNHMK +ISTNEHRMGPCVK +LFCPIEEDGIYKASVSR +LGGAVGQAVR +LLLPTVPSK +LSVDLTALLDMFQSK +LVIDGKPITIFQERDPTNIK +LVLLAILLPQFSEC +MMIIQIGQK +MPPIPMPPRR +NIMNKPEVNK +NITCNFSITMREK +NPRGAELVEEAK +NSMEFGLEYGL +NVADDTRSEDIR +PGTAAEAGLARPAG +PSSEAQTPSLAGPPCSPR +PVMHLSLDFSAH +QADVEGQVVK +QASEGPIK +RGAAQNIILASTGAAK +RTFQTPSTVR +SHVEPLPLADAS +SLTDPTPQRRCWR +SPAAAWELLPGR +SSGASWLDGK +TAFDEAIAELDTLNEDSYK +TDSHRGAPDEGYTK +TELLSLYSTS +TQMLPTAQ +TTGIPESGRTP +TTLEVDGENNL +TVPQMLYDSALNL +TVSAGMSAR +VCPSLGMLTK +VDTCINGQLIWRQ +VPLLMSGPFVPS +VVDIMAYMASKE +WCSARGEEYLK +WPGESLPSSRMPGR +YGSSYPPLLESLK
--- a/test-data/peptides_BED.bed Wed Apr 03 04:04:18 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ -chr11 115176449 115176491 AVDPDSSAEASGLR 255 + 115176449 115176491 0 1 42 0 -chr5 121445444 121445489 DGDLENPVLYSGAVK 255 - 121445444 121445489 0 1 45 0 -chr17 22866997 22867042 DSGASGSILEASAAR 255 - 22866997 22867042 0 1 45 0 -chr2 91155262 91155292 ELGSSDLTAR 255 - 91155262 91155292 0 1 30 0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/peptides_mapped.bed Sun Mar 14 03:01:11 2021 +0000 @@ -0,0 +1,16 @@ +4 115141782 115141931 AFLGMDPPSHGNTCTR 255 - 115141782 115141931 0,0,0 2 17,31 0,118 +12 100203760 100203633 EAFSLFDKDGDGTITTK 255 + 100203760 100203633 0,0,0 1 51 0 +17 87435875 87435926 EAFSLFDKDGDGTITTK 255 - 87435875 87435926 0,0,0 1 51 0 +7 16917669 16917720 EAFSLFDKDGDGTITTK 255 - 16917669 16917720 0,0,0 1 51 0 +7 16917669 16917720 EAFSLFDKDGDGTITTK 255 - 16917669 16917720 0,0,0 1 51 0 +12 100203709 100203633 EAFSLFDKDGDGTITTK 255 + 100203709 100203633 0,0,0 1 51 0 +17 87435875 87435926 EAFSLFDKDGDGTITTK 255 - 87435875 87435926 0,0,0 1 51 0 +17 87435875 87435926 EAFSLFDKDGDGTITTK 255 - 87435875 87435926 0,0,0 1 51 0 +12 100203709 100203633 EAFSLFDKDGDGTITTK 255 + 100203709 100203633 0,0,0 1 51 0 +7 66333266 66342783 ERILTGEGPAAEHR 255 - 66333266 66342783 0,0,0 2 20,22 0,9495 +11 102306997 102307134 PSSEAQTPSLAGPPCSPR 255 - 102306997 102307134 0,0,0 2 2,52 0,85 +7 43802079 43802109 RTFQTPSTVR 255 + 43802079 43802109 0,0,0 1 30 0 +7 43799327 43802109 RTFQTPSTVR 255 + 43799327 43802109 0,0,0 2 0,30 0,2752 +5 74718462 74725468 SHVEPLPLADAS 255 + 74718462 74725468 0,0,0 2 28,8 0,6998 +17 12702320 12703386 TTGIPESGRTP 255 - 12702320 12703386 0,0,0 2 7,26 0,1040 +4 150282053 150406206 WCSARGEEYLK 255 + 150282053 150406206 0,0,0 2 22,11 0,124142