| 0 | 1 | 
|  | 2 # | 
|  | 3 # Author: Praveen Kumar | 
|  | 4 # Updated: Nov 8th, 2017 | 
|  | 5 # | 
|  | 6 # | 
|  | 7 # | 
|  | 8 | 
|  | 9 import re | 
|  | 10 | 
|  | 11 | 
|  | 12 def main(): | 
|  | 13     import sys | 
|  | 14     if len(sys.argv) == 4: | 
|  | 15         inputFile = sys.argv | 
|  | 16         infh = open(inputFile[1], "r") | 
|  | 17         # infh = open("Mus_musculus.GRCm38.90.chr.gtf", "r") | 
|  | 18 | 
|  | 19         gtf = {} | 
|  | 20         gtf_transcript = {} | 
|  | 21         gtf_gene = {} | 
|  | 22         for each in infh.readlines(): | 
|  | 23             a = each.split("\t") | 
|  | 24             if re.search("^[^#]", each): | 
|  | 25                 if re.search("gene_biotype \"protein_coding\"", a[8]) and int(a[4].strip()) != int(a[3].strip()): | 
|  | 26                     type = a[2].strip() | 
|  | 27                     if type == "gene" or type == "exon" or type == "CDS" or type == "five_prime_utr" or type == "three_prime_utr": | 
|  | 28                         chr = "chr" + a[0].strip() | 
|  | 29                         strand = a[6].strip() | 
|  | 30                         if strand == "+": | 
|  | 31                             start = a[3].strip() | 
|  | 32                             end = a[4].strip() | 
|  | 33                         elif strand == "-": | 
|  | 34                             if int(a[4].strip()) > int(a[3].strip()): | 
|  | 35                                 start = a[3].strip() | 
|  | 36                                 end = a[4].strip() | 
|  | 37                             elif int(a[4].strip()) < int(a[3].strip()): | 
|  | 38                                 start = a[4].strip() | 
|  | 39                                 end = a[3].strip() | 
|  | 40                             else: | 
|  | 41                                 print "Something fishy in start end coordinates" | 
|  | 42                         else: | 
|  | 43                             print "Something fishy in reading" | 
|  | 44                         if not gtf.has_key(strand): | 
|  | 45                             gtf[strand] = {} | 
|  | 46                         if not gtf[strand].has_key(type): | 
|  | 47                             gtf[strand][type] = [] | 
|  | 48                         b = re.search("gene_id \"(.+?)\";", a[8].strip()) | 
|  | 49                         gene = b.group(1) | 
|  | 50                         if type == "gene": | 
|  | 51                             transcript = "" | 
|  | 52                         else: | 
|  | 53                             b = re.search("transcript_id \"(.+?)\";", a[8].strip()) | 
|  | 54                             transcript = b.group(1) | 
|  | 55                         data = (chr, start, end, gene, transcript, strand, type) | 
|  | 56                         gtf[strand][type].append(data) | 
|  | 57 | 
|  | 58                         if type == "exon": | 
|  | 59                             if gtf_transcript.has_key(chr+"#"+strand): | 
|  | 60                                 if gtf_transcript[chr+"#"+strand].has_key(transcript+"#"+gene): | 
|  | 61                                     gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) | 
|  | 62                                     gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) | 
|  | 63                                 else: | 
|  | 64                                     gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]] | 
|  | 65                                     gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) | 
|  | 66                                     gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) | 
|  | 67                             else: | 
|  | 68                                 gtf_transcript[chr+"#"+strand] = {} | 
|  | 69                                 gtf_transcript[chr+"#"+strand][transcript+"#"+gene] = [[],[]] | 
|  | 70                                 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][0].append(int(start)) | 
|  | 71                                 gtf_transcript[chr+"#"+strand][transcript+"#"+gene][1].append(int(end)) | 
|  | 72 | 
|  | 73                         if type == "gene": | 
|  | 74                             if gtf_gene.has_key(chr+"#"+strand): | 
|  | 75                                 gtf_gene[chr+"#"+strand][0].append(int(start)) | 
|  | 76                                 gtf_gene[chr+"#"+strand][1].append(int(end)) | 
|  | 77                                 gtf_gene[chr+"#"+strand][2].append(gene) | 
|  | 78                             else: | 
|  | 79                                 gtf_gene[chr+"#"+strand] = [[0],[0],["no_gene"]] | 
|  | 80                                 gtf_gene[chr+"#"+strand][0].append(int(start)) | 
|  | 81                                 gtf_gene[chr+"#"+strand][1].append(int(end)) | 
|  | 82                                 gtf_gene[chr+"#"+strand][2].append(gene) | 
|  | 83 | 
|  | 84 | 
|  | 85 | 
|  | 86         # "Starting Reading Intron . . ." | 
|  | 87 | 
|  | 88         gtf["+"]["intron"] = [] | 
|  | 89         gtf["-"]["intron"] = [] | 
|  | 90         for chr_strand  in gtf_transcript.keys(): | 
|  | 91             chr = chr_strand.split("#")[0] | 
|  | 92             strand = chr_strand.split("#")[1] | 
|  | 93 | 
|  | 94             for transcript_gene in gtf_transcript[chr_strand].keys(): | 
|  | 95                 start_list = gtf_transcript[chr_strand][transcript_gene][0] | 
|  | 96                 end_list = gtf_transcript[chr_strand][transcript_gene][1] | 
|  | 97                 sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])] | 
|  | 98                 sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])] | 
|  | 99                 if sorted_start_index == sorted_end_index: | 
|  | 100                     sorted_start = sorted(start_list) | 
|  | 101                     sorted_end = [end_list[i] for i in sorted_start_index] | 
|  | 102                     for x in range(len(sorted_start))[1:]: | 
|  | 103                         intron_start = sorted_end[x-1]+1 | 
|  | 104                         intron_end = sorted_start[x]-1 | 
|  | 105                         transcript = transcript_gene.split("#")[0] | 
|  | 106                         gene = transcript_gene.split("#")[1] | 
|  | 107                         data = (chr, str(intron_start), str(intron_end), gene, transcript, strand, "intron") | 
|  | 108                         gtf[strand]["intron"].append(data) | 
|  | 109 | 
|  | 110 | 
|  | 111         # "Starting Reading Intergenic . . ." | 
|  | 112 | 
|  | 113         gtf["+"]["intergenic"] = [] | 
|  | 114         gtf["-"]["intergenic"] = [] | 
|  | 115         for chr_strand  in gtf_gene.keys(): | 
|  | 116             chr = chr_strand.split("#")[0] | 
|  | 117             strand = chr_strand.split("#")[1] | 
|  | 118             start_list = gtf_gene[chr_strand][0] | 
|  | 119             end_list = gtf_gene[chr_strand][1] | 
|  | 120             gene_list = gtf_gene[chr_strand][2] | 
|  | 121             sorted_start_index = [i[0] for i in sorted(enumerate(start_list), key=lambda x:x[1])] | 
|  | 122             sorted_end_index = [i[0] for i in sorted(enumerate(end_list), key=lambda x:x[1])] | 
|  | 123 | 
|  | 124             sorted_start = sorted(start_list) | 
|  | 125             sorted_end = [end_list[i] for i in sorted_start_index] | 
|  | 126             sorted_gene = [gene_list[i] for i in sorted_start_index] | 
|  | 127             for x in range(len(sorted_start))[1:]: | 
|  | 128                 intergene_start = sorted_end[x-1]+1 | 
|  | 129                 intergene_end = sorted_start[x]-1 | 
|  | 130                 if intergene_start < intergene_end: | 
|  | 131                     intergene_1 = sorted_gene[x-1] | 
|  | 132                     intergene_2 = sorted_gene[x] | 
|  | 133                     gene = intergene_1 + "-#-" + intergene_2 | 
|  | 134                     data = (chr, str(intergene_start), str(intergene_end), gene, "", strand, "intergenic") | 
|  | 135                     gtf[strand]["intergenic"].append(data) | 
|  | 136 | 
|  | 137         import sqlite3 | 
|  | 138         # conn = sqlite3.connect('gtf_database.db') | 
|  | 139         conn = sqlite3.connect(":memory:") | 
|  | 140         c = conn.cursor() | 
|  | 141         # c.execute("DROP TABLE IF EXISTS gtf_data;") | 
|  | 142         # c.execute("CREATE TABLE IF NOT EXISTS gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)") | 
|  | 143         c.execute("CREATE TABLE gtf_data(chr text, start int, end int, gene text, transcript text, strand text, type text)") | 
|  | 144 | 
|  | 145         for strand in gtf.keys(): | 
|  | 146             if strand == "+": | 
|  | 147                 st = "positive" | 
|  | 148             elif strand == "-": | 
|  | 149                 st = "negative" | 
|  | 150             else: | 
|  | 151                 print "Something fishy in writing . . ." | 
|  | 152 | 
|  | 153             for type in gtf[strand].keys(): | 
|  | 154                 data = gtf[strand][type] | 
|  | 155                 c.executemany('INSERT INTO gtf_data VALUES (?,?,?,?,?,?,?)', data) | 
|  | 156 | 
|  | 157         conn.commit() | 
|  | 158 | 
|  | 159         infh = open(inputFile[2], "r") | 
|  | 160         # infh = open("Mouse_Data_All_peptides_withNewDBs.txt", "r") | 
|  | 161         data = infh.readlines() | 
|  | 162         # output file | 
|  | 163         outfh = open(inputFile[3], 'w') | 
|  | 164         # outfh = open("classified_1_Mouse_Data_All_peptides_withNewDBs.txt", "w") | 
|  | 165 | 
|  | 166         for each in data: | 
|  | 167             a = each.split("\t") | 
|  | 168             chr = a[0].strip() | 
|  | 169             pep_start = a[1].strip() | 
|  | 170             pep_end = a[2].strip() | 
|  | 171             strand = a[5].strip() | 
|  | 172             c.execute("select * from gtf_data where type = 'CDS' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 
|  | 173             rows = c.fetchall() | 
|  | 174             if len(rows) > 0: | 
|  | 175                 outfh.write(each.strip() + "\tCDS\n") | 
|  | 176             else: | 
|  | 177                 c.execute("select * from gtf_data where type = 'five_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 
|  | 178                 rows = c.fetchall() | 
|  | 179                 if len(rows) > 0: | 
|  | 180                     outfh.write(each.strip() + "\tfive_prime_utr\n") | 
|  | 181                 else: | 
|  | 182                     c.execute("select * from gtf_data where type = 'three_prime_utr' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 
|  | 183                     rows = c.fetchall() | 
|  | 184                     if len(rows) > 0: | 
|  | 185                         outfh.write(each.strip() + "\tthree_prime_utr\n") | 
|  | 186                     else: | 
|  | 187                         c.execute("select * from gtf_data where type = 'exon' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 
|  | 188                         rows = c.fetchall() | 
|  | 189                         if len(rows) > 0: | 
|  | 190                             outfh.write(each.strip() + "\texon\n") | 
|  | 191                         else: | 
|  | 192                             c.execute("select * from gtf_data where type = 'intron' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 
|  | 193                             rows = c.fetchall() | 
|  | 194                             if len(rows) > 0: | 
|  | 195                                 outfh.write(each.strip() + "\tintron\n") | 
|  | 196                             else: | 
|  | 197                                 c.execute("select * from gtf_data where type = 'gene' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 
|  | 198                                 rows = c.fetchall() | 
|  | 199                                 if len(rows) > 0: | 
|  | 200                                     outfh.write(each.strip() + "\tgene\n") | 
|  | 201                                 else: | 
|  | 202                                     c.execute("select * from gtf_data where type = 'intergenic' and chr = '"+chr+"' and start <= "+pep_start+" and end >= "+pep_end+" and strand = '"+strand+"' ") | 
|  | 203                                     rows = c.fetchall() | 
|  | 204                                     if len(rows) > 0: | 
|  | 205                                         outfh.write(each.strip() + "\tintergene\n") | 
|  | 206                                     else: | 
|  | 207                                         outfh.write(each.strip() + "\tOVERLAPPING_ON_TWO_REGIONS: PLEASE_LOOK_MANUALLY (Will be updated in next version)\n") | 
|  | 208 | 
|  | 209         conn.close() | 
|  | 210         outfh.close() | 
|  | 211     else: | 
|  | 212         print "USAGE: python pep_pointer.py <input GTF file> <input tblastn file> <name of output file>" | 
|  | 213     return None | 
|  | 214 | 
|  | 215 if __name__ == "__main__": | 
|  | 216     main() | 
|  | 217 | 
|  | 218 | 
|  | 219 | 
|  | 220 | 
|  | 221 |