annotate MACARON-GenMed-LabEx/MACARON @ 0:c9636a827049 draft default tip

Uploaded
author waqas
date Wed, 12 Sep 2018 08:45:03 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c9636a827049 Uploaded
waqas
parents:
diff changeset
1 #!/usr/bin/python
c9636a827049 Uploaded
waqas
parents:
diff changeset
2
c9636a827049 Uploaded
waqas
parents:
diff changeset
3 #alpha version 0.7 (05/09/18)
c9636a827049 Uploaded
waqas
parents:
diff changeset
4
c9636a827049 Uploaded
waqas
parents:
diff changeset
5 """
c9636a827049 Uploaded
waqas
parents:
diff changeset
6 #Script to identify SNPs located within a genetic codon:
c9636a827049 Uploaded
waqas
parents:
diff changeset
7
c9636a827049 Uploaded
waqas
parents:
diff changeset
8 Annotation of SNPs (including synonymous and non-synonymous variants) located within the same genetic codon requires attention. This idea conflicts with the annotations observed by traditional annotation software widely used now a days. While looking at the combined effect within the framework of genetic codon, we have new / altered codons that code for new amino acid that can be predicted by using this MACARON python script.
c9636a827049 Uploaded
waqas
parents:
diff changeset
9
c9636a827049 Uploaded
waqas
parents:
diff changeset
10 #####------How to run quickly run MACARON-------
c9636a827049 Uploaded
waqas
parents:
diff changeset
11
c9636a827049 Uploaded
waqas
parents:
diff changeset
12 # python MACARON -i yourinputfile.vcf
c9636a827049 Uploaded
waqas
parents:
diff changeset
13
c9636a827049 Uploaded
waqas
parents:
diff changeset
14 # python MACARON -i /path/to/yourinputfile.vcf -o /path/to/MACARON_output.txt -f (INFO)_FIELD_HEADER --GATK /path/to/GenomeAnalysisTK.jar --HG_REF /path/to/hg.fasta --SNPEFF /path/to/snpeff.jar --SNPEFF_HG hg19
c9636a827049 Uploaded
waqas
parents:
diff changeset
15
c9636a827049 Uploaded
waqas
parents:
diff changeset
16 # python MACARON -i yourinputfile.vcf -o MACARON_output.txt --gatk4 (when using gatk versions >= 4.0)
c9636a827049 Uploaded
waqas
parents:
diff changeset
17
c9636a827049 Uploaded
waqas
parents:
diff changeset
18 """
c9636a827049 Uploaded
waqas
parents:
diff changeset
19
c9636a827049 Uploaded
waqas
parents:
diff changeset
20 import sys, os, time
c9636a827049 Uploaded
waqas
parents:
diff changeset
21 import itertools
c9636a827049 Uploaded
waqas
parents:
diff changeset
22 import multiprocessing
c9636a827049 Uploaded
waqas
parents:
diff changeset
23 import re
c9636a827049 Uploaded
waqas
parents:
diff changeset
24 import subprocess
c9636a827049 Uploaded
waqas
parents:
diff changeset
25 from argparse import ArgumentParser
c9636a827049 Uploaded
waqas
parents:
diff changeset
26
c9636a827049 Uploaded
waqas
parents:
diff changeset
27 ## GLOBAL VARIABLES (IMPORTANT: You can set the default values here)
c9636a827049 Uploaded
waqas
parents:
diff changeset
28 GATK="/home/wuk/software/GenomeAnalysisTK.jar"
c9636a827049 Uploaded
waqas
parents:
diff changeset
29 #GATK="/home/wuk/software/gatk-4.0.1.2/gatk-package-4.0.1.2-local.jar"
c9636a827049 Uploaded
waqas
parents:
diff changeset
30 HG_REF="/home/wuk/Working/gnme_refrnces/Homo_sapiens_assembly19.fasta"
c9636a827049 Uploaded
waqas
parents:
diff changeset
31 SNPEFF="/home/wuk/software/snpEff/snpEff.jar"
c9636a827049 Uploaded
waqas
parents:
diff changeset
32 SNPEFF_HG="GRCh37.75" ## SnpEff human genome annotation database version
c9636a827049 Uploaded
waqas
parents:
diff changeset
33
c9636a827049 Uploaded
waqas
parents:
diff changeset
34 ## PRINTINGS, AESTHETICS
c9636a827049 Uploaded
waqas
parents:
diff changeset
35 str_progress_list = ["\tIndexing VCF file",
c9636a827049 Uploaded
waqas
parents:
diff changeset
36 "\tIdentifying SnpClusters",
c9636a827049 Uploaded
waqas
parents:
diff changeset
37 "\tExtracting SnpClusters",
c9636a827049 Uploaded
waqas
parents:
diff changeset
38 "\tAnnotating SnpClusters",
c9636a827049 Uploaded
waqas
parents:
diff changeset
39 "\tExcluding InDels",
c9636a827049 Uploaded
waqas
parents:
diff changeset
40 "\tGenerating a SnpCluster Table",
c9636a827049 Uploaded
waqas
parents:
diff changeset
41 "\tRe-annotating Codons",
c9636a827049 Uploaded
waqas
parents:
diff changeset
42 "\tRemoving SnpCluster if AA_Change_pcSNV == (AA1 or AA2)",
c9636a827049 Uploaded
waqas
parents:
diff changeset
43 "\tExtracting established SnpClusters - for which two (or three) SNPs are reference heterozygous (or non-reference homozygous)"]
c9636a827049 Uploaded
waqas
parents:
diff changeset
44 header = ("\n" +
c9636a827049 Uploaded
waqas
parents:
diff changeset
45 "(###############)\n" +
c9636a827049 Uploaded
waqas
parents:
diff changeset
46 "@@@@ MACARON @@@@\n" +
c9636a827049 Uploaded
waqas
parents:
diff changeset
47 "(###############)\n\n" +
c9636a827049 Uploaded
waqas
parents:
diff changeset
48 "Starting....\n")
c9636a827049 Uploaded
waqas
parents:
diff changeset
49 footer = "\nMACARON Run Completed. Bon Courage with Analysis ...,,,!!!!\n"
c9636a827049 Uploaded
waqas
parents:
diff changeset
50
c9636a827049 Uploaded
waqas
parents:
diff changeset
51 def animate(keep_anim, idx):
c9636a827049 Uploaded
waqas
parents:
diff changeset
52 c = "|/-\\"
c9636a827049 Uploaded
waqas
parents:
diff changeset
53 i=0
c9636a827049 Uploaded
waqas
parents:
diff changeset
54 while keep_anim.is_set():
c9636a827049 Uploaded
waqas
parents:
diff changeset
55 i += 1
c9636a827049 Uploaded
waqas
parents:
diff changeset
56 sys.stdout.write("\r>" + str_progress_list[idx] + ": " + c[i % len(c)] +"\r")
c9636a827049 Uploaded
waqas
parents:
diff changeset
57 sys.stdout.flush()
c9636a827049 Uploaded
waqas
parents:
diff changeset
58 time.sleep(0.1)
c9636a827049 Uploaded
waqas
parents:
diff changeset
59 sys.stdout.write("\r " + str_progress_list[idx] + ": Done!\n")
c9636a827049 Uploaded
waqas
parents:
diff changeset
60 def print_step(keep_anim, idx):
c9636a827049 Uploaded
waqas
parents:
diff changeset
61 if not(ECO):
c9636a827049 Uploaded
waqas
parents:
diff changeset
62 keep_anim.set()
c9636a827049 Uploaded
waqas
parents:
diff changeset
63 anim_thread = multiprocessing.Process(target=animate, args=(keep_anim, idx))
c9636a827049 Uploaded
waqas
parents:
diff changeset
64 anim_thread.start()
c9636a827049 Uploaded
waqas
parents:
diff changeset
65 return anim_thread
c9636a827049 Uploaded
waqas
parents:
diff changeset
66 else:
c9636a827049 Uploaded
waqas
parents:
diff changeset
67 sys.stdout.write(">" + str_progress_list[idx] + ": in progress...\r" )
c9636a827049 Uploaded
waqas
parents:
diff changeset
68 sys.stdout.flush()
c9636a827049 Uploaded
waqas
parents:
diff changeset
69 return None
c9636a827049 Uploaded
waqas
parents:
diff changeset
70 def end_print_step(keep_anim, anim_thread, idx):
c9636a827049 Uploaded
waqas
parents:
diff changeset
71 if not(ECO):
c9636a827049 Uploaded
waqas
parents:
diff changeset
72 keep_anim.clear(); anim_thread.join()
c9636a827049 Uploaded
waqas
parents:
diff changeset
73 else:
c9636a827049 Uploaded
waqas
parents:
diff changeset
74 sys.stdout.write(" " + str_progress_list[idx] + ": Done! \n")
c9636a827049 Uploaded
waqas
parents:
diff changeset
75 sys.stdout.flush()
c9636a827049 Uploaded
waqas
parents:
diff changeset
76
c9636a827049 Uploaded
waqas
parents:
diff changeset
77
c9636a827049 Uploaded
waqas
parents:
diff changeset
78 ## CLASS DEFINITIONS
c9636a827049 Uploaded
waqas
parents:
diff changeset
79
c9636a827049 Uploaded
waqas
parents:
diff changeset
80 class fileHandler:
c9636a827049 Uploaded
waqas
parents:
diff changeset
81 def __init__(self):
c9636a827049 Uploaded
waqas
parents:
diff changeset
82 self.data = [];
c9636a827049 Uploaded
waqas
parents:
diff changeset
83 def open_file(self, readfl):
c9636a827049 Uploaded
waqas
parents:
diff changeset
84 self.rfile = open(readfl, 'r').readlines()
c9636a827049 Uploaded
waqas
parents:
diff changeset
85 return self.rfile
c9636a827049 Uploaded
waqas
parents:
diff changeset
86 def write_file(self, writefl):
c9636a827049 Uploaded
waqas
parents:
diff changeset
87 self.wfile = open(writefl, 'w')
c9636a827049 Uploaded
waqas
parents:
diff changeset
88 return self.wfile
c9636a827049 Uploaded
waqas
parents:
diff changeset
89
c9636a827049 Uploaded
waqas
parents:
diff changeset
90 class SearchDB(fileHandler):
c9636a827049 Uploaded
waqas
parents:
diff changeset
91 def __init__(self):
c9636a827049 Uploaded
waqas
parents:
diff changeset
92 self.data = []
c9636a827049 Uploaded
waqas
parents:
diff changeset
93 from collections import defaultdict
c9636a827049 Uploaded
waqas
parents:
diff changeset
94 self.ident_ranges_HMBM = defaultdict(list)
c9636a827049 Uploaded
waqas
parents:
diff changeset
95 def Search_CODON(self, vcf_input, workdir, FIELDS):
c9636a827049 Uploaded
waqas
parents:
diff changeset
96 """
c9636a827049 Uploaded
waqas
parents:
diff changeset
97 Calling SNPClusters
c9636a827049 Uploaded
waqas
parents:
diff changeset
98
c9636a827049 Uploaded
waqas
parents:
diff changeset
99 USAGE INSTRUCTIONS: Full path to the software directories should be set before compiling.
c9636a827049 Uploaded
waqas
parents:
diff changeset
100 """
c9636a827049 Uploaded
waqas
parents:
diff changeset
101
c9636a827049 Uploaded
waqas
parents:
diff changeset
102 #######################
c9636a827049 Uploaded
waqas
parents:
diff changeset
103 Fld_Len = int(len(FIELDS.split(",")))
c9636a827049 Uploaded
waqas
parents:
diff changeset
104 FldwithF = " ".join(["-F " + str(x) for x in FIELDS.split(",")])
c9636a827049 Uploaded
waqas
parents:
diff changeset
105
c9636a827049 Uploaded
waqas
parents:
diff changeset
106 ## Options compatible with GATK versions >= 4.0: add option --gatk4 when calling MACARON
c9636a827049 Uploaded
waqas
parents:
diff changeset
107 if GATK4:
c9636a827049 Uploaded
waqas
parents:
diff changeset
108 GATK_v, SNPeff_v = (" ", " -v ") if VERBOSE else (" --QUIET true --verbosity ERROR ", " ")
c9636a827049 Uploaded
waqas
parents:
diff changeset
109 cmd0 = "java -Xmx12g -jar " + GATK + " IndexFeatureFile -F " + vcf_input + GATK_v
c9636a827049 Uploaded
waqas
parents:
diff changeset
110 cmd1 = "java -Xmx12g -jar " + GATK + " VariantFiltration -R " + HG_REF + " -V "+ vcf_input +" -O " + workdir + "snp_clsters2_ws3.vcf --cluster-size 2 --cluster-window-size 3" + GATK_v
c9636a827049 Uploaded
waqas
parents:
diff changeset
111 cmd2 = "java -Xmx12g -jar " + GATK + " SelectVariants -R " + HG_REF + " -V " + workdir + "snp_clsters2_ws3.vcf -O " + workdir + "snp_clsters2_ws3_clstronly.vcf -select 'FILTER == SnpCluster'" + GATK_v
c9636a827049 Uploaded
waqas
parents:
diff changeset
112 cmd3 = "java -Xmx12g -jar " + SNPEFF + SNPeff_v + SNPEFF_HG + " -formatEff -lof -classic " + workdir + "snp_clsters2_ws3_clstronly.vcf > " + workdir + "snp_clsters2_ws3_clstronly_annt.vcf"
c9636a827049 Uploaded
waqas
parents:
diff changeset
113 cmd4 = "java -Xmx12g -jar " + GATK + " SelectVariants -R " + HG_REF + " -V " + workdir + "snp_clsters2_ws3_clstronly_annt.vcf -O " + workdir + "snp_clsters2_ws3_clstronly_annt_snv.vcf --select-type-to-include SNP" + GATK_v
c9636a827049 Uploaded
waqas
parents:
diff changeset
114 cmd5 = "java -Xmx12g -jar " + GATK + " VariantsToTable -R " + HG_REF + " -V " + workdir + "snp_clsters2_ws3_clstronly_annt_snv.vcf -F CHROM -F POS -F ID -F REF -F ALT -F EFF " + FldwithF + " -GF GT --error-if-missing-data --show-filtered -O " + workdir + "snp_clsters2_ws3_clstronly_annt_snv_clstronly.table" + GATK_v
c9636a827049 Uploaded
waqas
parents:
diff changeset
115 ## GATK4 needs to index the VCF upfront
c9636a827049 Uploaded
waqas
parents:
diff changeset
116 thread = print_step(keep_anim, 0)
c9636a827049 Uploaded
waqas
parents:
diff changeset
117 subprocess.check_output(cmd0, shell=True)
c9636a827049 Uploaded
waqas
parents:
diff changeset
118 end_print_step(keep_anim, thread, 0)
c9636a827049 Uploaded
waqas
parents:
diff changeset
119
c9636a827049 Uploaded
waqas
parents:
diff changeset
120 else: ## Options comptatible with GATK versions < 4
c9636a827049 Uploaded
waqas
parents:
diff changeset
121 GATK_v, SNPeff_v = (" ", " -v ") if VERBOSE else (" --logging_level ERROR ", " ")
c9636a827049 Uploaded
waqas
parents:
diff changeset
122 cmd1 = "java -Xmx4g -jar " + GATK + " -T VariantFiltration -R " + HG_REF + " -V " + vcf_input + " -o " + workdir + "snp_clsters2_ws3.vcf --clusterSize 2 --clusterWindowSize 3" + GATK_v
c9636a827049 Uploaded
waqas
parents:
diff changeset
123 cmd2 = "java -Xmx4g -jar " + GATK + " -T SelectVariants -R " + HG_REF + " -V " + workdir + "snp_clsters2_ws3.vcf -o " + workdir + "snp_clsters2_ws3_clstronly.vcf -select 'FILTER == SnpCluster'" + GATK_v
c9636a827049 Uploaded
waqas
parents:
diff changeset
124 cmd3 = "java -Xmx4g -jar " + SNPEFF + SNPeff_v + SNPEFF_HG + " -formatEff -lof -classic " + workdir + "snp_clsters2_ws3_clstronly.vcf > " + workdir + "snp_clsters2_ws3_clstronly_annt.vcf"
c9636a827049 Uploaded
waqas
parents:
diff changeset
125 cmd4 = "java -Xmx4g -jar " + GATK + " -T SelectVariants -R " + HG_REF + " -V " + workdir + "snp_clsters2_ws3_clstronly_annt.vcf -o " + workdir + "snp_clsters2_ws3_clstronly_annt_snv.vcf --selectTypeToInclude SNP" + GATK_v
c9636a827049 Uploaded
waqas
parents:
diff changeset
126 cmd5 = "java -Xmx4g -jar " + GATK + " -T VariantsToTable -R " + HG_REF + " -V " + workdir + "snp_clsters2_ws3_clstronly_annt_snv.vcf -F CHROM -F POS -F ID -F REF -F ALT -F EFF " + FldwithF + " -GF GT --showFiltered -o " + workdir + "snp_clsters2_ws3_clstronly_annt_snv_clstronly.table" + GATK_v
c9636a827049 Uploaded
waqas
parents:
diff changeset
127
c9636a827049 Uploaded
waqas
parents:
diff changeset
128 thread = print_step(keep_anim, 1)
c9636a827049 Uploaded
waqas
parents:
diff changeset
129 subprocess.check_output(cmd1, shell=True)
c9636a827049 Uploaded
waqas
parents:
diff changeset
130 end_print_step(keep_anim, thread, 1)
c9636a827049 Uploaded
waqas
parents:
diff changeset
131 thread = print_step(keep_anim, 2)
c9636a827049 Uploaded
waqas
parents:
diff changeset
132 subprocess.check_output(cmd2, shell=True)
c9636a827049 Uploaded
waqas
parents:
diff changeset
133 end_print_step(keep_anim, thread, 2)
c9636a827049 Uploaded
waqas
parents:
diff changeset
134 thread = print_step(keep_anim, 3)
c9636a827049 Uploaded
waqas
parents:
diff changeset
135 subprocess.check_output(cmd3, shell=True)
c9636a827049 Uploaded
waqas
parents:
diff changeset
136 end_print_step(keep_anim, thread, 3)
c9636a827049 Uploaded
waqas
parents:
diff changeset
137 thread = print_step(keep_anim, 4)
c9636a827049 Uploaded
waqas
parents:
diff changeset
138 subprocess.check_output(cmd4, shell=True)
c9636a827049 Uploaded
waqas
parents:
diff changeset
139 end_print_step(keep_anim, thread, 4)
c9636a827049 Uploaded
waqas
parents:
diff changeset
140 thread = print_step(keep_anim, 5)
c9636a827049 Uploaded
waqas
parents:
diff changeset
141 subprocess.check_output(cmd5, shell=True)
c9636a827049 Uploaded
waqas
parents:
diff changeset
142 end_print_step(keep_anim, thread, 5)
c9636a827049 Uploaded
waqas
parents:
diff changeset
143
c9636a827049 Uploaded
waqas
parents:
diff changeset
144 subprocess.check_output("rm snpEff_genes.txt", shell=True)
c9636a827049 Uploaded
waqas
parents:
diff changeset
145 subprocess.check_output("rm snpEff_summary.html", shell=True)
c9636a827049 Uploaded
waqas
parents:
diff changeset
146
c9636a827049 Uploaded
waqas
parents:
diff changeset
147 ####--------------------------------------
c9636a827049 Uploaded
waqas
parents:
diff changeset
148
c9636a827049 Uploaded
waqas
parents:
diff changeset
149 def Change_zygo(ref, alt, zyg):
c9636a827049 Uploaded
waqas
parents:
diff changeset
150 """
c9636a827049 Uploaded
waqas
parents:
diff changeset
151 program to convert zygosity code ref/alt to 0/1.
c9636a827049 Uploaded
waqas
parents:
diff changeset
152 Input Variables:
c9636a827049 Uploaded
waqas
parents:
diff changeset
153
c9636a827049 Uploaded
waqas
parents:
diff changeset
154 ref = "A";alt = "G"
c9636a827049 Uploaded
waqas
parents:
diff changeset
155 zyg = ['A/G', 'G/A', 'G/G', 'A/A', './.', 'G/.', './G', './A', 'A/.']
c9636a827049 Uploaded
waqas
parents:
diff changeset
156 chzyg = ['0/1', '1/0', '1/1', '0/0', './.', '1/.', './1', './0', '0/.']
c9636a827049 Uploaded
waqas
parents:
diff changeset
157 InputUsage Variables:
c9636a827049 Uploaded
waqas
parents:
diff changeset
158 chgz = Change_zygo(ref, alt, zyg)
c9636a827049 Uploaded
waqas
parents:
diff changeset
159
c9636a827049 Uploaded
waqas
parents:
diff changeset
160 """
c9636a827049 Uploaded
waqas
parents:
diff changeset
161 import re
c9636a827049 Uploaded
waqas
parents:
diff changeset
162 chg_zyg = {}; i = 1
c9636a827049 Uploaded
waqas
parents:
diff changeset
163 for cs in zyg:
c9636a827049 Uploaded
waqas
parents:
diff changeset
164 csp = re.split('[|/]', cs)
c9636a827049 Uploaded
waqas
parents:
diff changeset
165 if ((ref == csp[0]) and (ref == csp[1]) and ((csp[0] != ".") and (csp[1] != "."))):
c9636a827049 Uploaded
waqas
parents:
diff changeset
166 chg_zyg[i] = "0/0"
c9636a827049 Uploaded
waqas
parents:
diff changeset
167 elif ((ref != csp[0]) and (ref == csp[1]) and ((csp[0] != ".") and (csp[1] != "."))):
c9636a827049 Uploaded
waqas
parents:
diff changeset
168 chg_zyg[i] = "1/0"
c9636a827049 Uploaded
waqas
parents:
diff changeset
169 elif ((ref == csp[0]) and (ref != csp[1]) and ((csp[0] != ".") and (csp[1] != "."))):
c9636a827049 Uploaded
waqas
parents:
diff changeset
170 chg_zyg[i] = "0/1"
c9636a827049 Uploaded
waqas
parents:
diff changeset
171 elif ((ref != csp[0]) and (ref != csp[1]) and ((csp[0] != ".") and (csp[1] != "."))):
c9636a827049 Uploaded
waqas
parents:
diff changeset
172 chg_zyg[i] = "1/1"
c9636a827049 Uploaded
waqas
parents:
diff changeset
173 elif ((csp[0] == ".") and (csp[1] == ".")):
c9636a827049 Uploaded
waqas
parents:
diff changeset
174 chg_zyg[i] = cs
c9636a827049 Uploaded
waqas
parents:
diff changeset
175 elif ((csp[1] == ".")):
c9636a827049 Uploaded
waqas
parents:
diff changeset
176 if (ref == csp[0]):
c9636a827049 Uploaded
waqas
parents:
diff changeset
177 chg_zyg[i] = "0/."
c9636a827049 Uploaded
waqas
parents:
diff changeset
178 elif (ref != csp[0]):
c9636a827049 Uploaded
waqas
parents:
diff changeset
179 chg_zyg[i] = "1/."
c9636a827049 Uploaded
waqas
parents:
diff changeset
180 elif ((csp[0] == ".")):
c9636a827049 Uploaded
waqas
parents:
diff changeset
181 if (ref == csp[1]):
c9636a827049 Uploaded
waqas
parents:
diff changeset
182 chg_zyg[i] = "./0"
c9636a827049 Uploaded
waqas
parents:
diff changeset
183 elif (ref != csp[1]):
c9636a827049 Uploaded
waqas
parents:
diff changeset
184 chg_zyg[i] = "./1"
c9636a827049 Uploaded
waqas
parents:
diff changeset
185 i += 1
c9636a827049 Uploaded
waqas
parents:
diff changeset
186 return list(chg_zyg.values())
c9636a827049 Uploaded
waqas
parents:
diff changeset
187 ####--------------------------------------
c9636a827049 Uploaded
waqas
parents:
diff changeset
188
c9636a827049 Uploaded
waqas
parents:
diff changeset
189 with open(workdir + "snp_clsters2_ws3_clstronly_annt_snv_clstronly.table", 'r') as f1, open(workdir + "temp_file1", 'w') as output:
c9636a827049 Uploaded
waqas
parents:
diff changeset
190 first_line = f1.readline().strip()
c9636a827049 Uploaded
waqas
parents:
diff changeset
191 zyg_head = '\t'.join(first_line.split()[6+Fld_Len:])
c9636a827049 Uploaded
waqas
parents:
diff changeset
192 output.write(first_line + "\t" + zyg_head + "\t" + str("Protein_coding_EFF AA-Change REF-codon ALT-codon") + "\n")
c9636a827049 Uploaded
waqas
parents:
diff changeset
193 for line in f1:
c9636a827049 Uploaded
waqas
parents:
diff changeset
194 line1 = line.strip()
c9636a827049 Uploaded
waqas
parents:
diff changeset
195 line_TAB = line1.split("\t")
c9636a827049 Uploaded
waqas
parents:
diff changeset
196 line_EFF = line_TAB[5].split("|")
c9636a827049 Uploaded
waqas
parents:
diff changeset
197 if (len(line_EFF) > 1):
c9636a827049 Uploaded
waqas
parents:
diff changeset
198 if ((line_EFF[1] == "SILENT") or (line_EFF[1] == "MISSENSE") or (line_EFF[1] == "NONSENSE")):
c9636a827049 Uploaded
waqas
parents:
diff changeset
199 True
c9636a827049 Uploaded
waqas
parents:
diff changeset
200 linesp = line_EFF[2].split("/")
c9636a827049 Uploaded
waqas
parents:
diff changeset
201 ref = line_TAB[3]
c9636a827049 Uploaded
waqas
parents:
diff changeset
202 alt = line_TAB[4]
c9636a827049 Uploaded
waqas
parents:
diff changeset
203 zyg = line_TAB[6+Fld_Len:]
c9636a827049 Uploaded
waqas
parents:
diff changeset
204 chgz = Change_zygo(ref, alt, zyg)
c9636a827049 Uploaded
waqas
parents:
diff changeset
205 chgz_out = '\t'.join(chgz)
c9636a827049 Uploaded
waqas
parents:
diff changeset
206 wrt = str(line_EFF[1] + "\t" + line_EFF[3] + "\t" + linesp[0] + "\t" + linesp[1])
c9636a827049 Uploaded
waqas
parents:
diff changeset
207 output.write(line1 + "\t" + chgz_out + "\t" + wrt + "\n")
c9636a827049 Uploaded
waqas
parents:
diff changeset
208 return None
c9636a827049 Uploaded
waqas
parents:
diff changeset
209
c9636a827049 Uploaded
waqas
parents:
diff changeset
210 ##-------------------------------
c9636a827049 Uploaded
waqas
parents:
diff changeset
211 import string
c9636a827049 Uploaded
waqas
parents:
diff changeset
212 gencode = {'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T','AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R','CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P','CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R','GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A','GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G','TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L','TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', 'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W'}
c9636a827049 Uploaded
waqas
parents:
diff changeset
213
c9636a827049 Uploaded
waqas
parents:
diff changeset
214 # a function to translate a single codon
c9636a827049 Uploaded
waqas
parents:
diff changeset
215 def translate_codon(self, codon):
c9636a827049 Uploaded
waqas
parents:
diff changeset
216 return self.gencode.get(codon.upper(), '#')
c9636a827049 Uploaded
waqas
parents:
diff changeset
217 # a function to split a sequence into codons
c9636a827049 Uploaded
waqas
parents:
diff changeset
218 def split_into_codons(self, dna, frame):
c9636a827049 Uploaded
waqas
parents:
diff changeset
219 codons = []
c9636a827049 Uploaded
waqas
parents:
diff changeset
220 for i in range(frame-1, len(dna)-2, 3):
c9636a827049 Uploaded
waqas
parents:
diff changeset
221 codon = dna[i:i+3]
c9636a827049 Uploaded
waqas
parents:
diff changeset
222 codons.append(codon)
c9636a827049 Uploaded
waqas
parents:
diff changeset
223 return codons
c9636a827049 Uploaded
waqas
parents:
diff changeset
224 # a function to translate a dna sequence in a single frame
c9636a827049 Uploaded
waqas
parents:
diff changeset
225 def translate_dna_single(self, dna, frame=1):
c9636a827049 Uploaded
waqas
parents:
diff changeset
226 codons = self.split_into_codons(dna, frame)
c9636a827049 Uploaded
waqas
parents:
diff changeset
227 amino_acids = ''
c9636a827049 Uploaded
waqas
parents:
diff changeset
228 for codon in codons:
c9636a827049 Uploaded
waqas
parents:
diff changeset
229 amino_acids = amino_acids + self.translate_codon(codon)
c9636a827049 Uploaded
waqas
parents:
diff changeset
230 return amino_acids
c9636a827049 Uploaded
waqas
parents:
diff changeset
231
c9636a827049 Uploaded
waqas
parents:
diff changeset
232 def TWO_VAR(self, workdir):
c9636a827049 Uploaded
waqas
parents:
diff changeset
233 """
c9636a827049 Uploaded
waqas
parents:
diff changeset
234 ###---Two variants (2VAR) codon changes---
c9636a827049 Uploaded
waqas
parents:
diff changeset
235 ##----------------------------------
c9636a827049 Uploaded
waqas
parents:
diff changeset
236 """
c9636a827049 Uploaded
waqas
parents:
diff changeset
237 lines = open(workdir + "temp_file1", "r").read().splitlines()
c9636a827049 Uploaded
waqas
parents:
diff changeset
238 writ2 = self.write_file(workdir + "temp_file2")
c9636a827049 Uploaded
waqas
parents:
diff changeset
239 #---
c9636a827049 Uploaded
waqas
parents:
diff changeset
240 midline_head = lines[0].strip().split("\t")
c9636a827049 Uploaded
waqas
parents:
diff changeset
241 try:
c9636a827049 Uploaded
waqas
parents:
diff changeset
242 midline_headcrp = '\t'.join([w.replace(midline_head[5], 'Gene_Name') for w in midline_head])
c9636a827049 Uploaded
waqas
parents:
diff changeset
243 except ValueError:
c9636a827049 Uploaded
waqas
parents:
diff changeset
244 pass
c9636a827049 Uploaded
waqas
parents:
diff changeset
245 writ2.write(str(midline_headcrp + "\t" + "ALT-codon_merge-2VAR" + "\t" + "AA-Change-2VAR") + "\n")
c9636a827049 Uploaded
waqas
parents:
diff changeset
246 #---
c9636a827049 Uploaded
waqas
parents:
diff changeset
247
c9636a827049 Uploaded
waqas
parents:
diff changeset
248 i=0; TRcode =""; protcode = ""; midline_crpit = ""
c9636a827049 Uploaded
waqas
parents:
diff changeset
249 for i in range(len(lines)):
c9636a827049 Uploaded
waqas
parents:
diff changeset
250 #---
c9636a827049 Uploaded
waqas
parents:
diff changeset
251 midline_crp = lines[i].strip().split("\t")
c9636a827049 Uploaded
waqas
parents:
diff changeset
252 try:
c9636a827049 Uploaded
waqas
parents:
diff changeset
253 if len(midline_crp[5].split("|")) != 1:
c9636a827049 Uploaded
waqas
parents:
diff changeset
254 GeneName = midline_crp[5].split("|")[5]
c9636a827049 Uploaded
waqas
parents:
diff changeset
255 midline_crpit = '\t'.join([w.replace(midline_crp[5], GeneName) for w in midline_crp])
c9636a827049 Uploaded
waqas
parents:
diff changeset
256 except ValueError:
c9636a827049 Uploaded
waqas
parents:
diff changeset
257 pass
c9636a827049 Uploaded
waqas
parents:
diff changeset
258 #---
c9636a827049 Uploaded
waqas
parents:
diff changeset
259 try:
c9636a827049 Uploaded
waqas
parents:
diff changeset
260 beforeline = lines[i-1].strip()
c9636a827049 Uploaded
waqas
parents:
diff changeset
261 line0 = beforeline.split("\t")[-3]
c9636a827049 Uploaded
waqas
parents:
diff changeset
262 beforeline1 = re.findall("\d+", line0)
c9636a827049 Uploaded
waqas
parents:
diff changeset
263
c9636a827049 Uploaded
waqas
parents:
diff changeset
264 midline = lines[i].strip()
c9636a827049 Uploaded
waqas
parents:
diff changeset
265 line1 = midline.split("\t")[-3]
c9636a827049 Uploaded
waqas
parents:
diff changeset
266 midline1 = re.findall("\d+", line1)
c9636a827049 Uploaded
waqas
parents:
diff changeset
267
c9636a827049 Uploaded
waqas
parents:
diff changeset
268 nextline = lines[i+1].strip()
c9636a827049 Uploaded
waqas
parents:
diff changeset
269 line2 = nextline.split("\t")[-3]
c9636a827049 Uploaded
waqas
parents:
diff changeset
270 nextline1 = re.findall("\d+", line2)
c9636a827049 Uploaded
waqas
parents:
diff changeset
271 #----Condition to replace empty list ([] to ['000']) produced from the header column "AA-Change".
c9636a827049 Uploaded
waqas
parents:
diff changeset
272 if (line0 == "AA-Change"):
c9636a827049 Uploaded
waqas
parents:
diff changeset
273 beforeline1.append('000')
c9636a827049 Uploaded
waqas
parents:
diff changeset
274 elif (line1 == "AA-Change"):
c9636a827049 Uploaded
waqas
parents:
diff changeset
275 midline1.append('000')
c9636a827049 Uploaded
waqas
parents:
diff changeset
276 elif (line2 == "AA-Change"):
c9636a827049 Uploaded
waqas
parents:
diff changeset
277 nextline1.append('000')
c9636a827049 Uploaded
waqas
parents:
diff changeset
278 #----
c9636a827049 Uploaded
waqas
parents:
diff changeset
279 REFbf=[]; line22=""
c9636a827049 Uploaded
waqas
parents:
diff changeset
280 if ((beforeline1[0] == midline1[0]) or (midline1[0] == nextline1[0])):
c9636a827049 Uploaded
waqas
parents:
diff changeset
281 spREFbf=[]; lscod1=[]; lscod2=[]
c9636a827049 Uploaded
waqas
parents:
diff changeset
282 REF= lines[i].strip().split("\t")[-2]
c9636a827049 Uploaded
waqas
parents:
diff changeset
283 if (midline1[0] == beforeline1[0]):
c9636a827049 Uploaded
waqas
parents:
diff changeset
284 REFbf = lines[i-1].strip().split("\t")[-2]
c9636a827049 Uploaded
waqas
parents:
diff changeset
285
c9636a827049 Uploaded
waqas
parents:
diff changeset
286 line11 = lines[i].strip().split("\t")[-1]
c9636a827049 Uploaded
waqas
parents:
diff changeset
287 if (midline1[0] == nextline1[0]):
c9636a827049 Uploaded
waqas
parents:
diff changeset
288 line22 = lines[i+1].strip().split("\t")[-1]
c9636a827049 Uploaded
waqas
parents:
diff changeset
289 if REFbf != []:
c9636a827049 Uploaded
waqas
parents:
diff changeset
290 for cod in REFbf:
c9636a827049 Uploaded
waqas
parents:
diff changeset
291 spREFbf.append(cod)
c9636a827049 Uploaded
waqas
parents:
diff changeset
292 for cod in line11:
c9636a827049 Uploaded
waqas
parents:
diff changeset
293 lscod1.append(cod)
c9636a827049 Uploaded
waqas
parents:
diff changeset
294 for cod in line22:
c9636a827049 Uploaded
waqas
parents:
diff changeset
295 lscod2.append(cod)
c9636a827049 Uploaded
waqas
parents:
diff changeset
296
c9636a827049 Uploaded
waqas
parents:
diff changeset
297 if (((lscod1[0].islower()==True and lscod2[0].islower()==True) or (lscod1[1].islower()==True and lscod2[1].islower()==True) or (lscod1[2].islower()==True and lscod2[2].islower()==True)) and ((lscod1[0] == lscod2[0]) or (lscod1[1] == lscod2[1]) or (lscod1[2] == lscod2[2]))):
c9636a827049 Uploaded
waqas
parents:
diff changeset
298 threeltr_code = []
c9636a827049 Uploaded
waqas
parents:
diff changeset
299 if ((lscod1[0].isupper()==True and lscod2[0].islower()==True) or (lscod1[0] == lscod2[0])):
c9636a827049 Uploaded
waqas
parents:
diff changeset
300 threeltr_code.append(lscod1[0])
c9636a827049 Uploaded
waqas
parents:
diff changeset
301 elif((lscod1[0].islower()==True and lscod2[0].isupper()==True) or (lscod1[0] == lscod2[0])):
c9636a827049 Uploaded
waqas
parents:
diff changeset
302 threeltr_code.append(lscod2[0])
c9636a827049 Uploaded
waqas
parents:
diff changeset
303
c9636a827049 Uploaded
waqas
parents:
diff changeset
304 if((lscod1[1].isupper()==True and lscod2[1].islower()==True) or (lscod1[1] == lscod2[1])):
c9636a827049 Uploaded
waqas
parents:
diff changeset
305 threeltr_code.append(lscod1[1])
c9636a827049 Uploaded
waqas
parents:
diff changeset
306 elif((lscod1[1].islower()==True and lscod2[1].isupper()==True) or (lscod1[1] == lscod2[1])):
c9636a827049 Uploaded
waqas
parents:
diff changeset
307 threeltr_code.append(lscod2[1])
c9636a827049 Uploaded
waqas
parents:
diff changeset
308
c9636a827049 Uploaded
waqas
parents:
diff changeset
309 if((lscod1[2].isupper()==True and lscod2[2].islower()==True) or (lscod1[2] == lscod2[2])):
c9636a827049 Uploaded
waqas
parents:
diff changeset
310 threeltr_code.append(lscod1[2])
c9636a827049 Uploaded
waqas
parents:
diff changeset
311 elif((lscod1[2].islower()==True and lscod2[2].isupper()==True) or (lscod1[2] == lscod2[2])):
c9636a827049 Uploaded
waqas
parents:
diff changeset
312 threeltr_code.append(lscod2[2])
c9636a827049 Uploaded
waqas
parents:
diff changeset
313 #-----------------------
c9636a827049 Uploaded
waqas
parents:
diff changeset
314 if(len(threeltr_code)==3):
c9636a827049 Uploaded
waqas
parents:
diff changeset
315 TRcode = ''.join(threeltr_code)
c9636a827049 Uploaded
waqas
parents:
diff changeset
316 else:
c9636a827049 Uploaded
waqas
parents:
diff changeset
317 TRcode = 'Multiallelic-t1'
c9636a827049 Uploaded
waqas
parents:
diff changeset
318 #-----------------------
c9636a827049 Uploaded
waqas
parents:
diff changeset
319 else:
c9636a827049 Uploaded
waqas
parents:
diff changeset
320 TRcode = '...'
c9636a827049 Uploaded
waqas
parents:
diff changeset
321
c9636a827049 Uploaded
waqas
parents:
diff changeset
322 if(TRcode != line22):
c9636a827049 Uploaded
waqas
parents:
diff changeset
323 protcode = self.translate_dna_single(TRcode)
c9636a827049 Uploaded
waqas
parents:
diff changeset
324 else:
c9636a827049 Uploaded
waqas
parents:
diff changeset
325 TRcode = "."
c9636a827049 Uploaded
waqas
parents:
diff changeset
326
c9636a827049 Uploaded
waqas
parents:
diff changeset
327 if (REF == REFbf):
c9636a827049 Uploaded
waqas
parents:
diff changeset
328 protcode = "#####";TRcode = "Multiallelic-t2"
c9636a827049 Uploaded
waqas
parents:
diff changeset
329 writ2.write(str(midline_crpit + "\t" + TRcode + "\t" + protcode) + "\n")
c9636a827049 Uploaded
waqas
parents:
diff changeset
330
c9636a827049 Uploaded
waqas
parents:
diff changeset
331 except IndexError:
c9636a827049 Uploaded
waqas
parents:
diff changeset
332 if not midline_crpit.split("\t")[0] == "CHROM":
c9636a827049 Uploaded
waqas
parents:
diff changeset
333 writ2.write(str(midline_crpit + "\t" + "." + "\t" + protcode) + "\n")
c9636a827049 Uploaded
waqas
parents:
diff changeset
334 pass
c9636a827049 Uploaded
waqas
parents:
diff changeset
335 return None
c9636a827049 Uploaded
waqas
parents:
diff changeset
336
c9636a827049 Uploaded
waqas
parents:
diff changeset
337 def THREE_VAR(self, workdir):
c9636a827049 Uploaded
waqas
parents:
diff changeset
338 """
c9636a827049 Uploaded
waqas
parents:
diff changeset
339 ###---Three variants (3VAR) codon changes---
c9636a827049 Uploaded
waqas
parents:
diff changeset
340 ##----------------------------------
c9636a827049 Uploaded
waqas
parents:
diff changeset
341 """
c9636a827049 Uploaded
waqas
parents:
diff changeset
342 lines = open(workdir + "temp_file2", "r").read().splitlines()
c9636a827049 Uploaded
waqas
parents:
diff changeset
343 writ3 = self.write_file(workdir + "temp_file3")
c9636a827049 Uploaded
waqas
parents:
diff changeset
344 #---
c9636a827049 Uploaded
waqas
parents:
diff changeset
345 midline_head =lines[0].strip()
c9636a827049 Uploaded
waqas
parents:
diff changeset
346 writ3.write(str(midline_head + "\t" + "ALT-codon_merge-3VAR" + "\t" + "AA-Change-3VAR") + "\n")
c9636a827049 Uploaded
waqas
parents:
diff changeset
347 #---
c9636a827049 Uploaded
waqas
parents:
diff changeset
348 i = 0; TRcode = ""; protcode = ""
c9636a827049 Uploaded
waqas
parents:
diff changeset
349 for i in range(len(lines)):
c9636a827049 Uploaded
waqas
parents:
diff changeset
350 try:
c9636a827049 Uploaded
waqas
parents:
diff changeset
351 #---
c9636a827049 Uploaded
waqas
parents:
diff changeset
352 midline_crp = lines[i].strip()
c9636a827049 Uploaded
waqas
parents:
diff changeset
353 #---
c9636a827049 Uploaded
waqas
parents:
diff changeset
354 beforeline = lines[i-1].strip()
c9636a827049 Uploaded
waqas
parents:
diff changeset
355 line0 = beforeline.split("\t")[-5]
c9636a827049 Uploaded
waqas
parents:
diff changeset
356 beforeline1 = re.findall("\d+", line0)
c9636a827049 Uploaded
waqas
parents:
diff changeset
357 midline = lines[i].strip()
c9636a827049 Uploaded
waqas
parents:
diff changeset
358 line1 = midline.split("\t")[-5]
c9636a827049 Uploaded
waqas
parents:
diff changeset
359 midline1 = re.findall("\d+", line1)
c9636a827049 Uploaded
waqas
parents:
diff changeset
360 nextline = lines[i+1].strip()
c9636a827049 Uploaded
waqas
parents:
diff changeset
361 line2 = nextline.split("\t")[-5]
c9636a827049 Uploaded
waqas
parents:
diff changeset
362 nextline1 = re.findall("\d+", line2)
c9636a827049 Uploaded
waqas
parents:
diff changeset
363
c9636a827049 Uploaded
waqas
parents:
diff changeset
364 line11=[]; line22=[]; writelst= []
c9636a827049 Uploaded
waqas
parents:
diff changeset
365 if ((beforeline1[0] == midline1[0]) or (midline1[0] == nextline1[0])):
c9636a827049 Uploaded
waqas
parents:
diff changeset
366 lscod1=[]; lscod2=[]
c9636a827049 Uploaded
waqas
parents:
diff changeset
367 line11 = lines[i].strip().split("\t")[-2]
c9636a827049 Uploaded
waqas
parents:
diff changeset
368 if (midline1[0] == nextline1[0]):
c9636a827049 Uploaded
waqas
parents:
diff changeset
369 line22 = lines[i+1].strip().split("\t")[-2]
c9636a827049 Uploaded
waqas
parents:
diff changeset
370 for cod in line11:
c9636a827049 Uploaded
waqas
parents:
diff changeset
371 lscod1.append(cod)
c9636a827049 Uploaded
waqas
parents:
diff changeset
372 for cod in line22:
c9636a827049 Uploaded
waqas
parents:
diff changeset
373 lscod2.append(cod)
c9636a827049 Uploaded
waqas
parents:
diff changeset
374 #-----------------------
c9636a827049 Uploaded
waqas
parents:
diff changeset
375 if(len(lscod1) == 3 and len(lscod2) == 3):
c9636a827049 Uploaded
waqas
parents:
diff changeset
376 threeltr_code = [];
c9636a827049 Uploaded
waqas
parents:
diff changeset
377
c9636a827049 Uploaded
waqas
parents:
diff changeset
378 if ((lscod1[0].isupper()==True) and (lscod2[0].islower()==True) or
c9636a827049 Uploaded
waqas
parents:
diff changeset
379 (lscod1[0].isupper()==True and lscod2[0].isupper()==True)):
c9636a827049 Uploaded
waqas
parents:
diff changeset
380 threeltr_code.append(lscod1[0])
c9636a827049 Uploaded
waqas
parents:
diff changeset
381 elif ((lscod1[0].islower()==True) and (lscod2[0].isupper()==True)):
c9636a827049 Uploaded
waqas
parents:
diff changeset
382 threeltr_code.append(lscod2[0])
c9636a827049 Uploaded
waqas
parents:
diff changeset
383
c9636a827049 Uploaded
waqas
parents:
diff changeset
384 if ((lscod1[1].isupper()==True) and (lscod2[1].islower()==True) or
c9636a827049 Uploaded
waqas
parents:
diff changeset
385 (lscod1[1].isupper()==True and lscod2[1].isupper()==True)):
c9636a827049 Uploaded
waqas
parents:
diff changeset
386 threeltr_code.append(lscod1[1])
c9636a827049 Uploaded
waqas
parents:
diff changeset
387 elif ((lscod1[1].islower()==True) and (lscod2[1].isupper()==True)):
c9636a827049 Uploaded
waqas
parents:
diff changeset
388 threeltr_code.append(lscod2[1])
c9636a827049 Uploaded
waqas
parents:
diff changeset
389
c9636a827049 Uploaded
waqas
parents:
diff changeset
390 if ((lscod1[2].isupper()==True) and (lscod2[2].islower()==True) or
c9636a827049 Uploaded
waqas
parents:
diff changeset
391 (lscod1[2].isupper()==True and lscod2[2].isupper()==True)):
c9636a827049 Uploaded
waqas
parents:
diff changeset
392 threeltr_code.append(lscod1[2])
c9636a827049 Uploaded
waqas
parents:
diff changeset
393 elif ((lscod1[2].islower()==True) and (lscod2[2].isupper()==True)):
c9636a827049 Uploaded
waqas
parents:
diff changeset
394 threeltr_code.append(lscod2[2])
c9636a827049 Uploaded
waqas
parents:
diff changeset
395 #-----------------------
c9636a827049 Uploaded
waqas
parents:
diff changeset
396 if(len(threeltr_code) == 3):
c9636a827049 Uploaded
waqas
parents:
diff changeset
397 TRcode = ''.join(threeltr_code)
c9636a827049 Uploaded
waqas
parents:
diff changeset
398 else:
c9636a827049 Uploaded
waqas
parents:
diff changeset
399 TRcode = 'Multiallelic-t1'
c9636a827049 Uploaded
waqas
parents:
diff changeset
400 #-----------------------
c9636a827049 Uploaded
waqas
parents:
diff changeset
401 else:
c9636a827049 Uploaded
waqas
parents:
diff changeset
402 TRcode = '.'
c9636a827049 Uploaded
waqas
parents:
diff changeset
403
c9636a827049 Uploaded
waqas
parents:
diff changeset
404 if(TRcode != line22):
c9636a827049 Uploaded
waqas
parents:
diff changeset
405 protcode = self.translate_dna_single(TRcode)
c9636a827049 Uploaded
waqas
parents:
diff changeset
406 if len(protcode) == 0:
c9636a827049 Uploaded
waqas
parents:
diff changeset
407 protcode = "."; TRcode = "."
c9636a827049 Uploaded
waqas
parents:
diff changeset
408 else:
c9636a827049 Uploaded
waqas
parents:
diff changeset
409 TRcode = "."
c9636a827049 Uploaded
waqas
parents:
diff changeset
410
c9636a827049 Uploaded
waqas
parents:
diff changeset
411 if protcode == "" or TRcode == "":
c9636a827049 Uploaded
waqas
parents:
diff changeset
412 protcode = "."; TRcode = "."
c9636a827049 Uploaded
waqas
parents:
diff changeset
413 #-----------------------
c9636a827049 Uploaded
waqas
parents:
diff changeset
414 writ3.write(str(midline_crp + "\t" + TRcode + "\t" + protcode) + "\n")
c9636a827049 Uploaded
waqas
parents:
diff changeset
415
c9636a827049 Uploaded
waqas
parents:
diff changeset
416 ##----------------------
c9636a827049 Uploaded
waqas
parents:
diff changeset
417 except IndexError:
c9636a827049 Uploaded
waqas
parents:
diff changeset
418 if not midline_crp.split("\t")[0] == "CHROM":
c9636a827049 Uploaded
waqas
parents:
diff changeset
419 if len(protcode) ==0:
c9636a827049 Uploaded
waqas
parents:
diff changeset
420 protcode = "."; TRcode = "."
c9636a827049 Uploaded
waqas
parents:
diff changeset
421 writ3.write(str(midline_crp + "\t" + TRcode + "\t" + protcode) + "\n")
c9636a827049 Uploaded
waqas
parents:
diff changeset
422 else:
c9636a827049 Uploaded
waqas
parents:
diff changeset
423 protcode = "."; TRcode = "."
c9636a827049 Uploaded
waqas
parents:
diff changeset
424 writ3.write(str(midline_crp + "\t" + TRcode + "\t" + protcode) + "\n")
c9636a827049 Uploaded
waqas
parents:
diff changeset
425 pass
c9636a827049 Uploaded
waqas
parents:
diff changeset
426 return None
c9636a827049 Uploaded
waqas
parents:
diff changeset
427
c9636a827049 Uploaded
waqas
parents:
diff changeset
428 def PARS_OUT_VAR(self, workdir):
c9636a827049 Uploaded
waqas
parents:
diff changeset
429 """
c9636a827049 Uploaded
waqas
parents:
diff changeset
430 ###---Pars variants (2VAR _ 3VAR) based on change of protein codons ---
c9636a827049 Uploaded
waqas
parents:
diff changeset
431 ##----------------------------------
c9636a827049 Uploaded
waqas
parents:
diff changeset
432 """
c9636a827049 Uploaded
waqas
parents:
diff changeset
433 #This first records the ones that do match and saves the value in column 3 (less the final matching amino acid residue).
c9636a827049 Uploaded
waqas
parents:
diff changeset
434 subprocess.check_output("awk 'index($(NF-6), $(NF-2)) {print}' " + workdir + "temp_file3 | awk '{print $(NF-6)}' | sed s'/.$//' > " + workdir + "matches.list", shell=True)
c9636a827049 Uploaded
waqas
parents:
diff changeset
435
c9636a827049 Uploaded
waqas
parents:
diff changeset
436 #This then searches for the ones that do not match, and then also eliminates any value recorded in matches.list
c9636a827049 Uploaded
waqas
parents:
diff changeset
437 subprocess.check_output("awk '!index($(NF-6), $(NF-2)) { print }' " + workdir + "temp_file3 | grep -w -v -f " + workdir + "matches.list | awk -F'\t' '{ print }' > " + workdir + "temp_file4", shell=True)
c9636a827049 Uploaded
waqas
parents:
diff changeset
438
c9636a827049 Uploaded
waqas
parents:
diff changeset
439 return None
c9636a827049 Uploaded
waqas
parents:
diff changeset
440
c9636a827049 Uploaded
waqas
parents:
diff changeset
441 def ZYGO_PAIR(self, macaron_output, workdir, FIELDS):
c9636a827049 Uploaded
waqas
parents:
diff changeset
442 """
c9636a827049 Uploaded
waqas
parents:
diff changeset
443 ###---Pair of zygosity check: remove pair of codons for which one SNP is reference homozygous ---
c9636a827049 Uploaded
waqas
parents:
diff changeset
444 ##----------------------------------
c9636a827049 Uploaded
waqas
parents:
diff changeset
445 """
c9636a827049 Uploaded
waqas
parents:
diff changeset
446
c9636a827049 Uploaded
waqas
parents:
diff changeset
447 Fld_Len = int(len(FIELDS.split(",")))
c9636a827049 Uploaded
waqas
parents:
diff changeset
448 lines = open(workdir + "temp_file4", "r").read().splitlines()
c9636a827049 Uploaded
waqas
parents:
diff changeset
449 writ4 = self.write_file(macaron_output)
c9636a827049 Uploaded
waqas
parents:
diff changeset
450 #---
c9636a827049 Uploaded
waqas
parents:
diff changeset
451 midline_head = lines[0].strip()
c9636a827049 Uploaded
waqas
parents:
diff changeset
452 writ4.write(str(midline_head) + "\n")
c9636a827049 Uploaded
waqas
parents:
diff changeset
453 #---
c9636a827049 Uploaded
waqas
parents:
diff changeset
454 nextline_pos_lst = []
c9636a827049 Uploaded
waqas
parents:
diff changeset
455
c9636a827049 Uploaded
waqas
parents:
diff changeset
456 for i in range(len(lines)):
c9636a827049 Uploaded
waqas
parents:
diff changeset
457 #---
c9636a827049 Uploaded
waqas
parents:
diff changeset
458 midline_crp = lines[i].strip()
c9636a827049 Uploaded
waqas
parents:
diff changeset
459 #---
c9636a827049 Uploaded
waqas
parents:
diff changeset
460 beforeline = lines[i-1].strip()
c9636a827049 Uploaded
waqas
parents:
diff changeset
461 beforeline_pos = re.findall("\d+", beforeline.split("\t")[-7])
c9636a827049 Uploaded
waqas
parents:
diff changeset
462 midline = lines[i].strip()
c9636a827049 Uploaded
waqas
parents:
diff changeset
463 midline_pos = re.findall("\d+", midline.split("\t")[-7])
c9636a827049 Uploaded
waqas
parents:
diff changeset
464 try:
c9636a827049 Uploaded
waqas
parents:
diff changeset
465 nextline = lines[i+1].strip()
c9636a827049 Uploaded
waqas
parents:
diff changeset
466 nextline_pos =re.findall("\d+", nextline.split("\t")[-7])
c9636a827049 Uploaded
waqas
parents:
diff changeset
467 ##-------
c9636a827049 Uploaded
waqas
parents:
diff changeset
468 if (midline_pos[0] == nextline_pos[0]):
c9636a827049 Uploaded
waqas
parents:
diff changeset
469 Start_lns = int(6) + Fld_Len
c9636a827049 Uploaded
waqas
parents:
diff changeset
470 End_lns = int(8)
c9636a827049 Uploaded
waqas
parents:
diff changeset
471 Start_zyg_lns = int(len(midline.split("\t")) - int(Start_lns))
c9636a827049 Uploaded
waqas
parents:
diff changeset
472 midline_zyg = midline.split("\t")[-int(Start_zyg_lns):-int(End_lns)]
c9636a827049 Uploaded
waqas
parents:
diff changeset
473 nextline_zyg = nextline.split("\t")[-int(Start_zyg_lns):-int(End_lns)]
c9636a827049 Uploaded
waqas
parents:
diff changeset
474 checkList = list(['0/1:1/0', '1/0:0/1', '1/0:1/0', '0/1:0/1', '1/0:1/1', '1/1:1/0', '0/1:1/1', '1/1:0/1', '1/1:1/1'])
c9636a827049 Uploaded
waqas
parents:
diff changeset
475 Mergetwozyg = ','.join([str(a) + ":" + b for a,b in zip(midline_zyg, nextline_zyg)])
c9636a827049 Uploaded
waqas
parents:
diff changeset
476 Mergetwozygsp = Mergetwozyg.split(",")
c9636a827049 Uploaded
waqas
parents:
diff changeset
477 if set(Mergetwozygsp).intersection(checkList) != set([]):
c9636a827049 Uploaded
waqas
parents:
diff changeset
478 if ((midline_pos[0] == nextline_pos[0]) and (midline_pos[0] == beforeline_pos[0])):
c9636a827049 Uploaded
waqas
parents:
diff changeset
479 writ4.write(str(nextline)+"\n")
c9636a827049 Uploaded
waqas
parents:
diff changeset
480 else:
c9636a827049 Uploaded
waqas
parents:
diff changeset
481 writ4.write(str(midline+"\n"+nextline)+"\n")
c9636a827049 Uploaded
waqas
parents:
diff changeset
482 ##-------
c9636a827049 Uploaded
waqas
parents:
diff changeset
483 except IndexError:
c9636a827049 Uploaded
waqas
parents:
diff changeset
484 #Condition to remove the duplicated line at the end and prints only paired lines.
c9636a827049 Uploaded
waqas
parents:
diff changeset
485 nextline_pos_lst.append(nextline_pos[0])
c9636a827049 Uploaded
waqas
parents:
diff changeset
486 result = dict((i, nextline_pos_lst.count(i)) for i in nextline_pos_lst)
c9636a827049 Uploaded
waqas
parents:
diff changeset
487 if len(result) == 1:
c9636a827049 Uploaded
waqas
parents:
diff changeset
488 writ4.write(str(nextline) + "\n")
c9636a827049 Uploaded
waqas
parents:
diff changeset
489
c9636a827049 Uploaded
waqas
parents:
diff changeset
490 return None
c9636a827049 Uploaded
waqas
parents:
diff changeset
491 ###------
c9636a827049 Uploaded
waqas
parents:
diff changeset
492
c9636a827049 Uploaded
waqas
parents:
diff changeset
493
c9636a827049 Uploaded
waqas
parents:
diff changeset
494 ## MACARON MAIN
c9636a827049 Uploaded
waqas
parents:
diff changeset
495 if __name__ == "__main__":
c9636a827049 Uploaded
waqas
parents:
diff changeset
496 ## Parsing arguments
c9636a827049 Uploaded
waqas
parents:
diff changeset
497 parser = ArgumentParser(description="-Script to identify SnpClusters (SNPs within the same genetic codon)")
c9636a827049 Uploaded
waqas
parents:
diff changeset
498 parser.add_argument("-i", "--infile", dest="INPUTFile",default=False, required=True, help="Full path of the input VCF file.")
c9636a827049 Uploaded
waqas
parents:
diff changeset
499 parser.add_argument("-o", "--outfile", dest="OUTPUTFile",default="./MACARON_output.txt", required=False, help="Path of the output txt file (Default Output file: MACARON_output.txt)")
c9636a827049 Uploaded
waqas
parents:
diff changeset
500 parser.add_argument("-f", "--fields", dest="Fields", default="QUAL", required=False, help=" Single field name or comma-seperated ',' multiple field names can be given. Field name should be given according to the (INFO) field header of the input vcf file. Example: -f Func.refGene,ExonicFunc.refGene,Gene.refGene,1000g2015aug_all,ExAC_ALL,ExAC_EAS,clinvar_20161128,gnomAD_exome_ALL,gnomAD_genome_ALL,EFF,CSQ")
c9636a827049 Uploaded
waqas
parents:
diff changeset
501 parser.add_argument("--GATK", dest="GATK_path", default=GATK, required=False, help="Indicate the full path to GATK jar file")
c9636a827049 Uploaded
waqas
parents:
diff changeset
502 parser.add_argument("--HG_REF", dest="HG_REF_path", default=HG_REF, required=False, help="Indicate the full path to the reference genome fasta file")
c9636a827049 Uploaded
waqas
parents:
diff changeset
503 parser.add_argument("--SNPEFF", dest="SNPEFF_path", default=SNPEFF, required=False, help="Indicate the full path to SnpEff jar file")
c9636a827049 Uploaded
waqas
parents:
diff changeset
504 parser.add_argument("--SNPEFF_HG", dest="SNPEFF_HG_version", default=SNPEFF_HG, required=False, help="Indicate SnpEff human genome annotation database version")
c9636a827049 Uploaded
waqas
parents:
diff changeset
505 parser.add_argument("--gatk4", dest="GATK4", default=False, required=False, action='store_true', help="Add this option when using GATK versions >= 4.0")
c9636a827049 Uploaded
waqas
parents:
diff changeset
506 parser.add_argument("-v", "--verbosity", dest="Verbosity", default=False, required=False, action='store_true', help="Use to print verbosity (Mostly GATK/SNPEFF output)")
c9636a827049 Uploaded
waqas
parents:
diff changeset
507 parser.add_argument("-c", "--eco_friendly", dest="ECO", default=False, required=False, action='store_true', help="Save a thread, but you won't be able to stare at the fabulous animation while waiting ...")
c9636a827049 Uploaded
waqas
parents:
diff changeset
508
c9636a827049 Uploaded
waqas
parents:
diff changeset
509 ## Assign arguments to global variables
c9636a827049 Uploaded
waqas
parents:
diff changeset
510 args = parser.parse_args()
c9636a827049 Uploaded
waqas
parents:
diff changeset
511 FIELDS = args.Fields
c9636a827049 Uploaded
waqas
parents:
diff changeset
512 GATK4 = args.GATK4
c9636a827049 Uploaded
waqas
parents:
diff changeset
513 VERBOSE = args.Verbosity
c9636a827049 Uploaded
waqas
parents:
diff changeset
514 GATK = args.GATK_path
c9636a827049 Uploaded
waqas
parents:
diff changeset
515 HG_REF = args.HG_REF_path
c9636a827049 Uploaded
waqas
parents:
diff changeset
516 SNPEFF = args.SNPEFF_path
c9636a827049 Uploaded
waqas
parents:
diff changeset
517 SNPEFF_HG = args.SNPEFF_HG_version
c9636a827049 Uploaded
waqas
parents:
diff changeset
518 ECO = args.ECO
c9636a827049 Uploaded
waqas
parents:
diff changeset
519
c9636a827049 Uploaded
waqas
parents:
diff changeset
520 ## Inputs / Outputs path & names
c9636a827049 Uploaded
waqas
parents:
diff changeset
521 INF = args.INPUTFile
c9636a827049 Uploaded
waqas
parents:
diff changeset
522 OUTF = args.OUTPUTFile
c9636a827049 Uploaded
waqas
parents:
diff changeset
523 TMPDIR = os.path.dirname(os.path.abspath(OUTF)) + "/macaron_tmp/"
c9636a827049 Uploaded
waqas
parents:
diff changeset
524 subprocess.check_output("mkdir -p " + TMPDIR, shell=True)
c9636a827049 Uploaded
waqas
parents:
diff changeset
525
c9636a827049 Uploaded
waqas
parents:
diff changeset
526 ########################
c9636a827049 Uploaded
waqas
parents:
diff changeset
527 ## MAIN PROCESS ##
c9636a827049 Uploaded
waqas
parents:
diff changeset
528 ########################
c9636a827049 Uploaded
waqas
parents:
diff changeset
529 print(header)
c9636a827049 Uploaded
waqas
parents:
diff changeset
530
c9636a827049 Uploaded
waqas
parents:
diff changeset
531 ## Check if global variables point to existing files:
c9636a827049 Uploaded
waqas
parents:
diff changeset
532 INF_check = os.path.exists(INF)
c9636a827049 Uploaded
waqas
parents:
diff changeset
533 GATK_check = os.path.exists(GATK)
c9636a827049 Uploaded
waqas
parents:
diff changeset
534 HG_REF_check = os.path.exists(HG_REF)
c9636a827049 Uploaded
waqas
parents:
diff changeset
535 SNPEFF_check = os.path.exists(SNPEFF)
c9636a827049 Uploaded
waqas
parents:
diff changeset
536 SNPEFF_HG_non_empty = (SNPEFF_HG != "")
c9636a827049 Uploaded
waqas
parents:
diff changeset
537 if not(INF_check and GATK_check and HG_REF_check and SNPEFF_check and SNPEFF_HG_non_empty):
c9636a827049 Uploaded
waqas
parents:
diff changeset
538 print(">ERROR : One or several global variable: \n VCF={} > {}\n GATK={} > {}\n HG_REF={} > {}\n SNPEFF={} > {}\n SNPEFF_HG={} > {}\n>Please correct and try again!".format(INF, INF_check, GATK, GATK_check, HG_REF, HG_REF_check, SNPEFF, SNPEFF_check, SNPEFF_HG, SNPEFF_HG_non_empty))
c9636a827049 Uploaded
waqas
parents:
diff changeset
539 sys.exit(1)
c9636a827049 Uploaded
waqas
parents:
diff changeset
540 else: ## If everything checks out, start MACARON
c9636a827049 Uploaded
waqas
parents:
diff changeset
541 ## Animation event initialization
c9636a827049 Uploaded
waqas
parents:
diff changeset
542 keep_anim = multiprocessing.Event()
c9636a827049 Uploaded
waqas
parents:
diff changeset
543
c9636a827049 Uploaded
waqas
parents:
diff changeset
544 ## 1)VARIANTS FILTERING, ANNOTATION (GATK,SNPEff)
c9636a827049 Uploaded
waqas
parents:
diff changeset
545 clF1 = SearchDB().Search_CODON(INF, TMPDIR, FIELDS)
c9636a827049 Uploaded
waqas
parents:
diff changeset
546 ## 2)SEARCH MULTI-SNPS CODONS
c9636a827049 Uploaded
waqas
parents:
diff changeset
547 thread = print_step(keep_anim, 6)
c9636a827049 Uploaded
waqas
parents:
diff changeset
548 clF2 = SearchDB().TWO_VAR(TMPDIR)
c9636a827049 Uploaded
waqas
parents:
diff changeset
549 clF3 = SearchDB().THREE_VAR(TMPDIR)
c9636a827049 Uploaded
waqas
parents:
diff changeset
550 end_print_step(keep_anim, thread, 6)
c9636a827049 Uploaded
waqas
parents:
diff changeset
551 ## 3)CHECK IF SNPCLUSTERS IMPACT CODON
c9636a827049 Uploaded
waqas
parents:
diff changeset
552 thread = print_step(keep_anim, 7)
c9636a827049 Uploaded
waqas
parents:
diff changeset
553 clF4 = SearchDB().PARS_OUT_VAR(TMPDIR)
c9636a827049 Uploaded
waqas
parents:
diff changeset
554 end_print_step(keep_anim, thread, 7)
c9636a827049 Uploaded
waqas
parents:
diff changeset
555 ## 4) EXTRACT SNPCLUSTERS (Keeping SnpCluster if >=1 sample is Ref-Heterozygous or nonRef-Homozygous)
c9636a827049 Uploaded
waqas
parents:
diff changeset
556 thread = print_step(keep_anim, 8)
c9636a827049 Uploaded
waqas
parents:
diff changeset
557 clF5 = SearchDB().ZYGO_PAIR(OUTF, TMPDIR, FIELDS)
c9636a827049 Uploaded
waqas
parents:
diff changeset
558 end_print_step(keep_anim, thread, 8)
c9636a827049 Uploaded
waqas
parents:
diff changeset
559 print(footer)
c9636a827049 Uploaded
waqas
parents:
diff changeset
560 subprocess.check_output("rm -r " + TMPDIR, shell=True)