Mercurial > repos > vipints > fml_gff3togtf
annotate GFFParser.py @ 11:5c6f33e20fcc default tip
requirement tag added
author | vipints <vipin@cbio.mskcc.org> |
---|---|
date | Fri, 24 Apr 2015 18:04:27 -0400 |
parents | c42c69aa81f8 |
children |
rev | line source |
---|---|
10
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
1 #!/usr/bin/env python |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
2 """ |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
3 Extract genome annotation from a GFF (a tab delimited format for storing sequence features and annotations) file. |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
4 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
5 Requirements: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
6 Numpy :- http://numpy.org/ |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
7 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
8 Copyright (C) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
9 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
10 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
11 2012-2015 Memorial Sloan Kettering Cancer Center, New York City, USA. |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
12 """ |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
13 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
14 import re |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
15 import os |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
16 import sys |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
17 import urllib |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
18 import numpy as np |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
19 import helper as utils |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
20 from collections import defaultdict |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
21 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
def attribute_tags(col9):
    """
    Split the key-value tags from the attribute column, it takes column number 9 from GTF/GFF file

    GFF3 attributes look like ``key=value`` and GTF attributes look like
    ``key "value"``; the flavour is detected from the first attribute only.
    Comma separated values are split into a list, double quotes are removed
    and URL percent-escapes are decoded.

    @args col9: attribute column from GFF file
    @type col9: str
    @return: (is_gff, info) - is_gff is True when the GFF3 ``key=value``
        pattern was detected; info maps each key to the list of its values.
        A key without any value is kept with an empty list.
    @rtype: tuple(bool, collections.defaultdict(list))
    """
    # urllib.unquote moved to urllib.parse.unquote in Python 3; resolve it
    # locally so this function works under both interpreters.
    try:
        from urllib.parse import unquote
    except ImportError:
        from urllib import unquote

    info = defaultdict(list)
    is_gff = False

    if not col9:
        return is_gff, info

    # trim the line ending semi-colon, ucsc may have some white-space
    col9 = col9.rstrip(';| ')
    if not col9:
        # only separators/white-space were present: report "no attributes"
        # instead of inventing an ID tag with an empty value list
        return is_gff, info

    # attributes from 9th column, try the separators from the most to the
    # least specific one
    atbs = col9.split(" ; ")
    if len(atbs) == 1:
        atbs = col9.split("; ")
        if len(atbs) == 1:
            atbs = col9.split(";")

    # check the GFF3 pattern which has key value pairs like: key=value
    gff3_pat = re.compile(r"\w+=")
    # sometime GTF have: gene_id uc002zkg.1;
    gtf_pat = re.compile(r"\s?\w+\s")

    key_vals = []

    if gff3_pat.match(atbs[0]):  # gff3 pattern
        is_gff = True
        for at in atbs:
            if not at.strip():
                continue  # empty token, e.g. produced by ";;"
            # split on the first '=' only, the value itself may contain '='
            key, _, val = at.partition('=')
            key_vals.append((key, val))
    elif gtf_pat.match(atbs[0]):  # gtf pattern
        for at in atbs:
            at = at.strip()
            if not at:
                continue
            # a key-only token yields an empty value instead of failing
            key, _, val = at.partition(' ')
            key_vals.append((key, val))
    else:
        # to handle attribute column has only single value
        key_vals.append(('ID', atbs[0]))

    # get key, val items
    for key, val in key_vals:
        # remove the double quotes from feature identifier
        val = val.replace('"', '')
        # replace the web formatting place holders to plain text format
        info[key].extend([unquote(v) for v in val.split(',') if v])

    return is_gff, info
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
68 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
def spec_features_keywd(gff_parts):
    """
    Specify the feature key word according to the GFF specifications

    For records that do not use GFF3 style attributes, derive the "Parent"
    and "GParent" tags (and, when missing, the record id) from the commonly
    used transcript / gene identifier keys. The record is modified in place
    and also returned.

    @args gff_parts: parsed record holding the keys "info" (attribute
        key -> list of values), "type" (feature type) and "id"
    @type gff_parts: dict
    @return: the same gff_parts object with the derived tags added
    @rtype: dict
    """
    info = gff_parts["info"]

    # the first transcript level identifier found becomes the Parent; a
    # membership test is used so that a defaultdict is not populated with
    # empty entries as a side effect of probing
    for t_id in ["transcript_id", "transcriptId", "proteinId"]:
        if t_id in info:
            info["Parent"] = info[t_id]
            break

    # the first gene level identifier found becomes the grand parent
    for g_id in ["gene_id", "geneid", "geneId", "name", "gene_name", "genename"]:
        if g_id in info:
            info["GParent"] = info[g_id]
            break

    ## TODO key words
    child_types = ["intron", "exon", "three_prime_UTR",
                   "coding_exon", "five_prime_UTR", "CDS", "stop_codon",
                   "start_codon"]
    for flat_name in ["Transcript", "CDS"]:
        if flat_name not in info:
            continue
        values = info[flat_name]
        # parents
        if gff_parts['type'] in [flat_name] or re.search(r'transcript', gff_parts['type'], re.IGNORECASE):
            # only fill a missing id, and only when a value is available
            if not gff_parts['id'] and values:
                gff_parts['id'] = values[0]
        # children
        elif gff_parts["type"] in child_types:
            info["Parent"] = values
        # only the first flat name present in the record is considered
        break
    return gff_parts
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
103 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
104 def Parse(ga_file): |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
105 """ |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
106 Parsing GFF/GTF file based on feature relationship, it takes the input file. |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
107 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
108 @args ga_file: input file name |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
109 @type ga_file: str |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
110 """ |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
111 child_map = defaultdict(list) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
112 parent_map = dict() |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
113 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
114 ga_handle = utils.open_file(ga_file) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
115 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
116 for rec in ga_handle: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
117 rec = rec.strip('\n\r') |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
118 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
119 # skip empty line fasta identifier and commented line |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
120 if not rec or rec[0] in ['#', '>']: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
121 continue |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
122 # skip the genome sequence |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
123 if not re.search('\t', rec): |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
124 continue |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
125 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
126 parts = rec.split('\t') |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
127 assert len(parts) >= 8, rec |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
128 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
129 # process the attribute column (9th column) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
130 ftype, tags = attribute_tags(parts[-1]) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
131 if not tags: # skip the line if no attribute column. |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
132 continue |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
133 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
134 # extract fields |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
135 if parts[1]: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
136 tags["source"] = parts[1] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
137 if parts[7]: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
138 tags["phase"] = parts[7] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
139 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
140 gff_info = dict() |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
141 gff_info['info'] = dict(tags) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
142 gff_info["is_gff3"] = ftype |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
143 gff_info['chr'] = parts[0] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
144 gff_info['score'] = parts[5] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
145 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
146 if parts[3] and parts[4]: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
147 gff_info['location'] = [int(parts[3]) , |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
148 int(parts[4])] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
149 gff_info['type'] = parts[2] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
150 gff_info['id'] = tags.get('ID', [''])[0] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
151 if parts[6] in ['?', '.']: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
152 parts[6] = None |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
153 gff_info['strand'] = parts[6] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
154 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
155 # key word according to the GFF spec. |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
156 # is_gff3 flag is false check this condition and get the attribute fields |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
157 if not ftype: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
158 gff_info = spec_features_keywd(gff_info) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
159 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
160 # link the feature relationships |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
161 if gff_info['info'].has_key('Parent'): |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
162 for p in gff_info['info']['Parent']: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
163 if p == gff_info['id']: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
164 gff_info['id'] = '' |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
165 break |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
166 rec_category = 'child' |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
167 elif gff_info['id']: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
168 rec_category = 'parent' |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
169 else: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
170 rec_category = 'record' |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
171 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
172 # organize the features depending on the record category |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
173 if rec_category == 'child': |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
174 for p in gff_info['info']['Parent']: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
175 # create the data structure based on source and feature id |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
176 child_map[(gff_info['chr'], gff_info['info']['source'], p)].append( |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
177 dict( type = gff_info['type'], |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
178 location = gff_info['location'], |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
179 strand = gff_info['strand'], |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
180 score = gff_info['score'], |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
181 ID = gff_info['id'], |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
182 gene_id = gff_info['info'].get('GParent', '') |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
183 )) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
184 elif rec_category == 'parent': |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
185 parent_map[(gff_info['chr'], gff_info['info']['source'], gff_info['id'])] = dict( |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
186 type = gff_info['type'], |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
187 location = gff_info['location'], |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
188 strand = gff_info['strand'], |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
189 score = gff_info['score'], |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
190 name = tags.get('Name', [''])[0]) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
191 elif rec_category == 'record': |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
192 #TODO how to handle plain records? |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
193 c = 1 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
194 ga_handle.close() |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
195 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
196 # depending on the file type, create the missing parent features |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
197 if not ftype: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
198 parent_map, child_map = create_missing_feature_type(parent_map, child_map) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
199 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
200 # connecting parent child relations |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
201 # essentially the parent child features are here from any type of GTF/GFF2/GFF3 file |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
202 gene_mat = format_gene_models(parent_map, child_map) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
203 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
204 return gene_mat |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
205 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
206 def format_gene_models(parent_nf_map, child_nf_map): |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
207 """ |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
208 Generate GeneObject based on the parsed file contents |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
209 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
210 @args parent_nf_map: parent features with source and chromosome information |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
211 @type parent_nf_map: collections defaultdict |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
212 @args child_nf_map: transcript and exon information are encoded |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
213 @type child_nf_map: collections defaultdict |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
214 """ |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
215 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
216 g_cnt = 0 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
217 gene = np.zeros((len(parent_nf_map),), dtype = utils.init_gene()) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
218 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
219 for pkey, pdet in parent_nf_map.items(): |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
220 # considering only gene features |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
221 #if not re.search(r'gene', pdet.get('type', '')): |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
222 # continue |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
223 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
224 # infer the gene start and stop from the child features if the parent has no location |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
225 if not pdet.get('location', []): |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
226 GNS, GNE = [], [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
227 # multiple number of transcripts |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
228 for L1 in child_nf_map[pkey]: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
229 GNS.append(L1.get('location', [])[0]) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
230 GNE.append(L1.get('location', [])[1]) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
231 GNS.sort() |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
232 GNE.sort() |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
233 pdet['location'] = [GNS[0], GNE[-1]] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
234 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
235 orient = pdet.get('strand', '') |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
236 gene[g_cnt]['id'] = g_cnt +1 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
237 gene[g_cnt]['chr'] = pkey[0] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
238 gene[g_cnt]['source'] = pkey[1] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
239 gene[g_cnt]['name'] = pkey[-1] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
240 gene[g_cnt]['start'] = pdet.get('location', [])[0] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
241 gene[g_cnt]['stop'] = pdet.get('location', [])[1] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
242 gene[g_cnt]['strand'] = orient |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
243 gene[g_cnt]['score'] = pdet.get('score','') |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
244 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
245 # default value |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
246 gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 0 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
247 if len(child_nf_map[pkey]) > 1: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
248 gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = 1 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
249 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
250 # complete sub-feature for all transcripts |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
251 dim = len(child_nf_map[pkey]) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
252 TRS = np.zeros((dim,), dtype=np.object) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
253 TR_TYP = np.zeros((dim,), dtype=np.object) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
254 EXON = np.zeros((dim,), dtype=np.object) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
255 UTR5 = np.zeros((dim,), dtype=np.object) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
256 UTR3 = np.zeros((dim,), dtype=np.object) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
257 CDS = np.zeros((dim,), dtype=np.object) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
258 TISc = np.zeros((dim,), dtype=np.object) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
259 TSSc = np.zeros((dim,), dtype=np.object) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
260 CLV = np.zeros((dim,), dtype=np.object) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
261 CSTOP = np.zeros((dim,), dtype=np.object) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
262 TSTAT = np.zeros((dim,), dtype=np.object) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
263 TSCORE = np.zeros((dim,), dtype=np.object) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
264 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
265 # fetching corresponding transcripts |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
266 for xq, Lv1 in enumerate(child_nf_map[pkey]): |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
267 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
268 TID = Lv1.get('ID', '') |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
269 TRS[xq]= np.array([TID]) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
270 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
271 TYPE = Lv1.get('type', '') |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
272 TR_TYP[xq] = np.array('') |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
273 TR_TYP[xq] = np.array(TYPE) if TYPE else TR_TYP[xq] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
274 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
275 orient = Lv1.get('strand', '') |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
276 tr_score = Lv1.get('score', '') |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
277 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
278 # fetching different sub-features |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
279 child_feat = defaultdict(list) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
280 for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
281 E_TYP = Lv2.get('type', '') |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
282 child_feat[E_TYP].append(Lv2.get('location')) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
283 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
284 # make general ascending order of coordinates |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
285 if orient == '-': |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
286 for etype, excod in child_feat.items(): |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
287 if len(excod) > 1: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
288 if excod[0][0] > excod[-1][0]: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
289 excod.reverse() |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
290 child_feat[etype] = excod |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
291 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
292 # make exon coordinate from cds and utr regions |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
293 if not child_feat.get('exon'): |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
294 if child_feat.get('CDS'): |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
295 exon_cod = utils.make_Exon_cod( orient, |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
296 NonetoemptyList(child_feat.get('five_prime_UTR')), |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
297 NonetoemptyList(child_feat.get('CDS')), |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
298 NonetoemptyList(child_feat.get('three_prime_UTR'))) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
299 child_feat['exon'] = exon_cod |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
300 else: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
301 # TODO only UTR's |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
302 # searching through keys to find a pattern describing exon feature |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
303 ex_key_pattern = [k for k in child_feat if k.endswith("exon")] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
304 if ex_key_pattern: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
305 child_feat['exon'] = child_feat[ex_key_pattern[0]] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
306 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
307 # stop_codon is separated from CDS; add its coordinates based on strand |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
308 if child_feat.get('stop_codon'): |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
309 if orient == '+': |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
310 if child_feat.get('stop_codon')[0][0] - child_feat.get('CDS')[-1][1] == 1: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
311 child_feat['CDS'][-1] = [child_feat.get('CDS')[-1][0], child_feat.get('stop_codon')[0][1]] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
312 else: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
313 child_feat['CDS'].append(child_feat.get('stop_codon')[0]) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
314 elif orient == '-': |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
315 if child_feat.get('CDS')[0][0] - child_feat.get('stop_codon')[0][1] == 1: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
316 child_feat['CDS'][0] = [child_feat.get('stop_codon')[0][0], child_feat.get('CDS')[0][1]] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
317 else: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
318 child_feat['CDS'].insert(0, child_feat.get('stop_codon')[0]) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
319 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
320 # transcript signal sites |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
321 TIS, cdsStop, TSS, cleave = [], [], [], [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
322 cds_status, exon_status, utr_status = 0, 0, 0 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
323 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
324 if child_feat.get('exon'): |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
325 TSS = [child_feat.get('exon')[-1][1]] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
326 TSS = [child_feat.get('exon')[0][0]] if orient == '+' else TSS |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
327 cleave = [child_feat.get('exon')[0][0]] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
328 cleave = [child_feat.get('exon')[-1][1]] if orient == '+' else cleave |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
329 exon_status = 1 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
330 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
331 if child_feat.get('CDS'): |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
332 if orient == '+': |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
333 TIS = [child_feat.get('CDS')[0][0]] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
334 cdsStop = [child_feat.get('CDS')[-1][1]-3] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
335 else: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
336 TIS = [child_feat.get('CDS')[-1][1]] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
337 cdsStop = [child_feat.get('CDS')[0][0]+3] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
338 cds_status = 1 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
339 # cds phase calculation |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
340 child_feat['CDS'] = utils.add_CDS_phase(orient, child_feat.get('CDS')) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
341 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
342 # sub-feature status |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
343 if child_feat.get('three_prime_UTR') or child_feat.get('five_prime_UTR'): |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
344 utr_status =1 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
345 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
346 if utr_status == cds_status == exon_status == 1: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
347 t_status = 1 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
348 else: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
349 t_status = 0 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
350 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
351 # add sub-feature # make array for export to different out |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
352 TSTAT[xq] = t_status |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
353 EXON[xq] = np.array(child_feat.get('exon'), np.float64) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
354 UTR5[xq] = np.array(NonetoemptyList(child_feat.get('five_prime_UTR'))) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
355 UTR3[xq] = np.array(NonetoemptyList(child_feat.get('three_prime_UTR'))) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
356 CDS[xq] = np.array(NonetoemptyList(child_feat.get('CDS'))) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
357 TISc[xq] = np.array(TIS) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
358 CSTOP[xq] = np.array(cdsStop) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
359 TSSc[xq] = np.array(TSS) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
360 CLV[xq] = np.array(cleave) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
361 TSCORE[xq] = tr_score |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
362 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
363 # add sub-features to the parent gene feature |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
364 gene[g_cnt]['transcript_status'] = TSTAT |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
365 gene[g_cnt]['transcripts'] = TRS |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
366 gene[g_cnt]['exons'] = EXON |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
367 gene[g_cnt]['utr5_exons'] = UTR5 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
368 gene[g_cnt]['cds_exons'] = CDS |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
369 gene[g_cnt]['utr3_exons'] = UTR3 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
370 gene[g_cnt]['transcript_type'] = TR_TYP |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
371 gene[g_cnt]['tis'] = TISc |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
372 gene[g_cnt]['cdsStop'] = CSTOP |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
373 gene[g_cnt]['tss'] = TSSc |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
374 gene[g_cnt]['cleave'] = CLV |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
375 gene[g_cnt]['transcript_score'] = TSCORE |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
376 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
377 gene[g_cnt]['gene_info'] = dict( ID = pkey[-1], |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
378 Name = pdet.get('name'), |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
379 Source = pkey[1]) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
380 # few empty fields // TODO fill this: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
381 gene[g_cnt]['anno_id'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
382 gene[g_cnt]['confgenes_id'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
383 gene[g_cnt]['alias'] = '' |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
384 gene[g_cnt]['name2'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
385 gene[g_cnt]['chr_num'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
386 gene[g_cnt]['paralogs'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
387 gene[g_cnt]['transcript_valid'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
388 gene[g_cnt]['exons_confirmed'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
389 gene[g_cnt]['tis_conf'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
390 gene[g_cnt]['tis_info'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
391 gene[g_cnt]['cdsStop_conf'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
392 gene[g_cnt]['cdsStop_info'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
393 gene[g_cnt]['tss_info'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
394 gene[g_cnt]['tss_conf'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
395 gene[g_cnt]['cleave_info'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
396 gene[g_cnt]['cleave_conf'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
397 gene[g_cnt]['polya_info'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
398 gene[g_cnt]['polya_conf'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
399 gene[g_cnt]['is_valid'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
400 gene[g_cnt]['transcript_complete'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
401 gene[g_cnt]['is_complete'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
402 gene[g_cnt]['is_correctly_gff3_referenced'] = '' |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
403 gene[g_cnt]['splicegraph'] = [] |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
404 g_cnt += 1 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
405 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
406 ## deleting empty gene records from the main array |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
407 XPFLG=0 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
408 for XP, ens in enumerate(gene): |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
409 if ens[0]==0: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
410 XPFLG=1 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
411 break |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
412 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
413 if XPFLG==1: |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
414 XQC = range(XP, len(gene)+1) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
415 gene = np.delete(gene, XQC) |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
416 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
417 return gene |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
418 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
def NonetoemptyList(XS):
    """
    Convert a None type to an empty list.

    Any value other than None (including an already empty list) is handed
    back untouched, so callers can wrap the result of dict.get() directly.

    @args XS: value to normalise
    @type XS: None or list
    """
    return [] if XS is None else XS
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
427 |
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
def create_missing_feature_type(p_feat, c_feat):
    """
    GFF/GTF file defines only child features. This function tries to create
    the parent feature from the information provided in the attribute column.

    example:
    chr21   hg19_knownGene  exon    9690071 9690100 0.000000        +       .       gene_id "uc002zkg.1"; transcript_id "uc002zkg.1";
    chr21   hg19_knownGene  exon    9692178 9692207 0.000000        +       .       gene_id "uc021wgt.1"; transcript_id "uc021wgt.1";
    chr21   hg19_knownGene  exon    9711935 9712038 0.000000        +       .       gene_id "uc011abu.2"; transcript_id "uc011abu.2";

    This function gets the parsed feature annotations.

    For every key of c_feat a 'gene' record is written into p_feat (updated
    in place) and a new child map is built holding one inferred transcript
    record under the gene key plus the original records re-parented under
    the transcript key. Keys whose record list is empty are skipped, since
    nothing can be inferred from them.

    @args p_feat: Parent feature map
    @type p_feat: collections defaultdict
    @args c_feat: Child feature map
    @type c_feat: collections defaultdict
    @return: the updated p_feat and the newly built child feature map
    @rtype: tuple (dict, collections defaultdict)
    """

    child_n_map = defaultdict(list)
    for fid, det in c_feat.items():
        # No records: there is no span, strand or gene id to infer from.
        if not det:
            continue

        # get the details from grand child
        gene_id = strand = score = None
        starts, stops = [], []
        seen_types = set()
        for gchild in det:
            # gene id, strand and score of the last record win
            gene_id = gchild.get('gene_id', [''])[0]
            location = gchild.get('location', [])
            starts.append(location[0])
            stops.append(location[1])
            strand = gchild.get('strand', '')
            score = gchild.get('score', '')
            if gchild.get('type', '') == "gene":  ## gencode GTF file has this problem
                # NOTE(review): the coordinates of such a record were already
                # added to the span above - confirm that this is intended.
                continue
            seen_types.add(gchild.get('type', ''))

        # infer transcript type: any CDS record marks a coding transcript
        if 'CDS' in seen_types or 'cds' in seen_types:
            transcript_type = 'mRNA'
        else:
            transcript_type = 'transcript'

        # gene id and transcript id are same: prefix the transcript id so the
        # gene key and the transcript key do not collide in child_n_map
        transcript_id = fid[-1]
        if gene_id == transcript_id:
            transcript_id = 'Transcript:' + str(gene_id)

        gene_key = (fid[0], fid[1], gene_id)
        transcript_key = (fid[0], fid[1], transcript_id)

        # level -1 feature type
        p_feat[gene_key] = dict(type='gene',
                                location=[],  ## infer location based on multiple transcripts
                                strand=strand,
                                name=gene_id)
        # level -2 feature type: spans from smallest start to largest stop
        child_n_map[gene_key].append(
            dict(type=transcript_type,
                 location=[min(starts), max(stops)],
                 strand=strand,
                 score=score,
                 ID=transcript_id,
                 gene_id=''))
        # reorganizing the grand child
        for gchild in det:
            child_n_map[transcript_key].append(
                dict(type=gchild.get('type', ''),
                     location=gchild.get('location'),
                     strand=gchild.get('strand'),
                     ID=gchild.get('ID'),
                     score=gchild.get('score'),
                     gene_id=''))
    return p_feat, child_n_map
c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
vipints <vipin@cbio.mskcc.org>
parents:
diff
changeset
|
496 |