Mercurial > repos > waqas > macaron
comparison MACARON-GenMed-LabEx/MACARON @ 0:c9636a827049 draft default tip
Uploaded
| author | waqas |
|---|---|
| date | Wed, 12 Sep 2018 08:45:03 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c9636a827049 |
|---|---|
| 1 #!/usr/bin/python | |
| 2 | |
| 3 #alpha version 0.7 (05/09/18) | |
| 4 | |
| 5 """ | |
| 6 #Script to identify SNPs located within a genetic codon: | |
| 7 | |
| 8 Annotation of SNPs (including synonymous and non-synonymous variants) located within the same genetic codon requires attention. This idea conflicts with the annotations observed by traditional annotation software widely used now a days. While looking at the combined effect within the framework of genetic codon, we have new / altered codons that code for new amino acid that can be predicted by using this MACARON python script. | |
| 9 | |
| 10 #####------How to run quickly run MACARON------- | |
| 11 | |
| 12 # python MACARON -i yourinputfile.vcf | |
| 13 | |
| 14 # python MACARON -i /path/to/yourinputfile.vcf -o /path/to/MACARON_output.txt -f (INFO)_FIELD_HEADER --GATK /path/to/GenomeAnalysisTK.jar --HG_REF /path/to/hg.fasta --SNPEFF /path/to/snpeff.jar --SNPEFF_HG hg19 | |
| 15 | |
| 16 # python MACARON -i yourinputfile.vcf -o MACARON_output.txt --gatk4 (when using gatk versions >= 4.0) | |
| 17 | |
| 18 """ | |
| 19 | |
| 20 import sys, os, time | |
| 21 import itertools | |
| 22 import multiprocessing | |
| 23 import re | |
| 24 import subprocess | |
| 25 from argparse import ArgumentParser | |
| 26 | |
| 27 ## GLOBAL VARIABLES (IMPORTANT: You can set the default values here) | |
| 28 GATK="/home/wuk/software/GenomeAnalysisTK.jar" | |
| 29 #GATK="/home/wuk/software/gatk-4.0.1.2/gatk-package-4.0.1.2-local.jar" | |
| 30 HG_REF="/home/wuk/Working/gnme_refrnces/Homo_sapiens_assembly19.fasta" | |
| 31 SNPEFF="/home/wuk/software/snpEff/snpEff.jar" | |
| 32 SNPEFF_HG="GRCh37.75" ## SnpEff human genome annotation database version | |
| 33 | |
| 34 ## PRINTINGS, AESTHETICS | |
| 35 str_progress_list = ["\tIndexing VCF file", | |
| 36 "\tIdentifying SnpClusters", | |
| 37 "\tExtracting SnpClusters", | |
| 38 "\tAnnotating SnpClusters", | |
| 39 "\tExcluding InDels", | |
| 40 "\tGenerating a SnpCluster Table", | |
| 41 "\tRe-annotating Codons", | |
| 42 "\tRemoving SnpCluster if AA_Change_pcSNV == (AA1 or AA2)", | |
| 43 "\tExtracting established SnpClusters - for which two (or three) SNPs are reference heterozygous (or non-reference homozygous)"] | |
| 44 header = ("\n" + | |
| 45 "(###############)\n" + | |
| 46 "@@@@ MACARON @@@@\n" + | |
| 47 "(###############)\n\n" + | |
| 48 "Starting....\n") | |
| 49 footer = "\nMACARON Run Completed. Bon Courage with Analysis ...,,,!!!!\n" | |
| 50 | |
| 51 def animate(keep_anim, idx): | |
| 52 c = "|/-\\" | |
| 53 i=0 | |
| 54 while keep_anim.is_set(): | |
| 55 i += 1 | |
| 56 sys.stdout.write("\r>" + str_progress_list[idx] + ": " + c[i % len(c)] +"\r") | |
| 57 sys.stdout.flush() | |
| 58 time.sleep(0.1) | |
| 59 sys.stdout.write("\r " + str_progress_list[idx] + ": Done!\n") | |
| 60 def print_step(keep_anim, idx): | |
| 61 if not(ECO): | |
| 62 keep_anim.set() | |
| 63 anim_thread = multiprocessing.Process(target=animate, args=(keep_anim, idx)) | |
| 64 anim_thread.start() | |
| 65 return anim_thread | |
| 66 else: | |
| 67 sys.stdout.write(">" + str_progress_list[idx] + ": in progress...\r" ) | |
| 68 sys.stdout.flush() | |
| 69 return None | |
| 70 def end_print_step(keep_anim, anim_thread, idx): | |
| 71 if not(ECO): | |
| 72 keep_anim.clear(); anim_thread.join() | |
| 73 else: | |
| 74 sys.stdout.write(" " + str_progress_list[idx] + ": Done! \n") | |
| 75 sys.stdout.flush() | |
| 76 | |
| 77 | |
| 78 ## CLASS DEFINITIONS | |
| 79 | |
| 80 class fileHandler: | |
| 81 def __init__(self): | |
| 82 self.data = []; | |
| 83 def open_file(self, readfl): | |
| 84 self.rfile = open(readfl, 'r').readlines() | |
| 85 return self.rfile | |
| 86 def write_file(self, writefl): | |
| 87 self.wfile = open(writefl, 'w') | |
| 88 return self.wfile | |
| 89 | |
| 90 class SearchDB(fileHandler): | |
| 91 def __init__(self): | |
| 92 self.data = [] | |
| 93 from collections import defaultdict | |
| 94 self.ident_ranges_HMBM = defaultdict(list) | |
| 95 def Search_CODON(self, vcf_input, workdir, FIELDS): | |
| 96 """ | |
| 97 Calling SNPClusters | |
| 98 | |
| 99 USAGE INSTRUCTIONS: Full path to the software directories should be set before compiling. | |
| 100 """ | |
| 101 | |
| 102 ####################### | |
| 103 Fld_Len = int(len(FIELDS.split(","))) | |
| 104 FldwithF = " ".join(["-F " + str(x) for x in FIELDS.split(",")]) | |
| 105 | |
| 106 ## Options compatible with GATK versions >= 4.0: add option --gatk4 when calling MACARON | |
| 107 if GATK4: | |
| 108 GATK_v, SNPeff_v = (" ", " -v ") if VERBOSE else (" --QUIET true --verbosity ERROR ", " ") | |
| 109 cmd0 = "java -Xmx12g -jar " + GATK + " IndexFeatureFile -F " + vcf_input + GATK_v | |
| 110 cmd1 = "java -Xmx12g -jar " + GATK + " VariantFiltration -R " + HG_REF + " -V "+ vcf_input +" -O " + workdir + "snp_clsters2_ws3.vcf --cluster-size 2 --cluster-window-size 3" + GATK_v | |
| 111 cmd2 = "java -Xmx12g -jar " + GATK + " SelectVariants -R " + HG_REF + " -V " + workdir + "snp_clsters2_ws3.vcf -O " + workdir + "snp_clsters2_ws3_clstronly.vcf -select 'FILTER == SnpCluster'" + GATK_v | |
| 112 cmd3 = "java -Xmx12g -jar " + SNPEFF + SNPeff_v + SNPEFF_HG + " -formatEff -lof -classic " + workdir + "snp_clsters2_ws3_clstronly.vcf > " + workdir + "snp_clsters2_ws3_clstronly_annt.vcf" | |
| 113 cmd4 = "java -Xmx12g -jar " + GATK + " SelectVariants -R " + HG_REF + " -V " + workdir + "snp_clsters2_ws3_clstronly_annt.vcf -O " + workdir + "snp_clsters2_ws3_clstronly_annt_snv.vcf --select-type-to-include SNP" + GATK_v | |
| 114 cmd5 = "java -Xmx12g -jar " + GATK + " VariantsToTable -R " + HG_REF + " -V " + workdir + "snp_clsters2_ws3_clstronly_annt_snv.vcf -F CHROM -F POS -F ID -F REF -F ALT -F EFF " + FldwithF + " -GF GT --error-if-missing-data --show-filtered -O " + workdir + "snp_clsters2_ws3_clstronly_annt_snv_clstronly.table" + GATK_v | |
| 115 ## GATK4 needs to index the VCF upfront | |
| 116 thread = print_step(keep_anim, 0) | |
| 117 subprocess.check_output(cmd0, shell=True) | |
| 118 end_print_step(keep_anim, thread, 0) | |
| 119 | |
| 120 else: ## Options comptatible with GATK versions < 4 | |
| 121 GATK_v, SNPeff_v = (" ", " -v ") if VERBOSE else (" --logging_level ERROR ", " ") | |
| 122 cmd1 = "java -Xmx4g -jar " + GATK + " -T VariantFiltration -R " + HG_REF + " -V " + vcf_input + " -o " + workdir + "snp_clsters2_ws3.vcf --clusterSize 2 --clusterWindowSize 3" + GATK_v | |
| 123 cmd2 = "java -Xmx4g -jar " + GATK + " -T SelectVariants -R " + HG_REF + " -V " + workdir + "snp_clsters2_ws3.vcf -o " + workdir + "snp_clsters2_ws3_clstronly.vcf -select 'FILTER == SnpCluster'" + GATK_v | |
| 124 cmd3 = "java -Xmx4g -jar " + SNPEFF + SNPeff_v + SNPEFF_HG + " -formatEff -lof -classic " + workdir + "snp_clsters2_ws3_clstronly.vcf > " + workdir + "snp_clsters2_ws3_clstronly_annt.vcf" | |
| 125 cmd4 = "java -Xmx4g -jar " + GATK + " -T SelectVariants -R " + HG_REF + " -V " + workdir + "snp_clsters2_ws3_clstronly_annt.vcf -o " + workdir + "snp_clsters2_ws3_clstronly_annt_snv.vcf --selectTypeToInclude SNP" + GATK_v | |
| 126 cmd5 = "java -Xmx4g -jar " + GATK + " -T VariantsToTable -R " + HG_REF + " -V " + workdir + "snp_clsters2_ws3_clstronly_annt_snv.vcf -F CHROM -F POS -F ID -F REF -F ALT -F EFF " + FldwithF + " -GF GT --showFiltered -o " + workdir + "snp_clsters2_ws3_clstronly_annt_snv_clstronly.table" + GATK_v | |
| 127 | |
| 128 thread = print_step(keep_anim, 1) | |
| 129 subprocess.check_output(cmd1, shell=True) | |
| 130 end_print_step(keep_anim, thread, 1) | |
| 131 thread = print_step(keep_anim, 2) | |
| 132 subprocess.check_output(cmd2, shell=True) | |
| 133 end_print_step(keep_anim, thread, 2) | |
| 134 thread = print_step(keep_anim, 3) | |
| 135 subprocess.check_output(cmd3, shell=True) | |
| 136 end_print_step(keep_anim, thread, 3) | |
| 137 thread = print_step(keep_anim, 4) | |
| 138 subprocess.check_output(cmd4, shell=True) | |
| 139 end_print_step(keep_anim, thread, 4) | |
| 140 thread = print_step(keep_anim, 5) | |
| 141 subprocess.check_output(cmd5, shell=True) | |
| 142 end_print_step(keep_anim, thread, 5) | |
| 143 | |
| 144 subprocess.check_output("rm snpEff_genes.txt", shell=True) | |
| 145 subprocess.check_output("rm snpEff_summary.html", shell=True) | |
| 146 | |
| 147 ####-------------------------------------- | |
| 148 | |
| 149 def Change_zygo(ref, alt, zyg): | |
| 150 """ | |
| 151 program to convert zygosity code ref/alt to 0/1. | |
| 152 Input Variables: | |
| 153 | |
| 154 ref = "A";alt = "G" | |
| 155 zyg = ['A/G', 'G/A', 'G/G', 'A/A', './.', 'G/.', './G', './A', 'A/.'] | |
| 156 chzyg = ['0/1', '1/0', '1/1', '0/0', './.', '1/.', './1', './0', '0/.'] | |
| 157 InputUsage Variables: | |
| 158 chgz = Change_zygo(ref, alt, zyg) | |
| 159 | |
| 160 """ | |
| 161 import re | |
| 162 chg_zyg = {}; i = 1 | |
| 163 for cs in zyg: | |
| 164 csp = re.split('[|/]', cs) | |
| 165 if ((ref == csp[0]) and (ref == csp[1]) and ((csp[0] != ".") and (csp[1] != "."))): | |
| 166 chg_zyg[i] = "0/0" | |
| 167 elif ((ref != csp[0]) and (ref == csp[1]) and ((csp[0] != ".") and (csp[1] != "."))): | |
| 168 chg_zyg[i] = "1/0" | |
| 169 elif ((ref == csp[0]) and (ref != csp[1]) and ((csp[0] != ".") and (csp[1] != "."))): | |
| 170 chg_zyg[i] = "0/1" | |
| 171 elif ((ref != csp[0]) and (ref != csp[1]) and ((csp[0] != ".") and (csp[1] != "."))): | |
| 172 chg_zyg[i] = "1/1" | |
| 173 elif ((csp[0] == ".") and (csp[1] == ".")): | |
| 174 chg_zyg[i] = cs | |
| 175 elif ((csp[1] == ".")): | |
| 176 if (ref == csp[0]): | |
| 177 chg_zyg[i] = "0/." | |
| 178 elif (ref != csp[0]): | |
| 179 chg_zyg[i] = "1/." | |
| 180 elif ((csp[0] == ".")): | |
| 181 if (ref == csp[1]): | |
| 182 chg_zyg[i] = "./0" | |
| 183 elif (ref != csp[1]): | |
| 184 chg_zyg[i] = "./1" | |
| 185 i += 1 | |
| 186 return list(chg_zyg.values()) | |
| 187 ####-------------------------------------- | |
| 188 | |
| 189 with open(workdir + "snp_clsters2_ws3_clstronly_annt_snv_clstronly.table", 'r') as f1, open(workdir + "temp_file1", 'w') as output: | |
| 190 first_line = f1.readline().strip() | |
| 191 zyg_head = '\t'.join(first_line.split()[6+Fld_Len:]) | |
| 192 output.write(first_line + "\t" + zyg_head + "\t" + str("Protein_coding_EFF AA-Change REF-codon ALT-codon") + "\n") | |
| 193 for line in f1: | |
| 194 line1 = line.strip() | |
| 195 line_TAB = line1.split("\t") | |
| 196 line_EFF = line_TAB[5].split("|") | |
| 197 if (len(line_EFF) > 1): | |
| 198 if ((line_EFF[1] == "SILENT") or (line_EFF[1] == "MISSENSE") or (line_EFF[1] == "NONSENSE")): | |
| 199 True | |
| 200 linesp = line_EFF[2].split("/") | |
| 201 ref = line_TAB[3] | |
| 202 alt = line_TAB[4] | |
| 203 zyg = line_TAB[6+Fld_Len:] | |
| 204 chgz = Change_zygo(ref, alt, zyg) | |
| 205 chgz_out = '\t'.join(chgz) | |
| 206 wrt = str(line_EFF[1] + "\t" + line_EFF[3] + "\t" + linesp[0] + "\t" + linesp[1]) | |
| 207 output.write(line1 + "\t" + chgz_out + "\t" + wrt + "\n") | |
| 208 return None | |
| 209 | |
| 210 ##------------------------------- | |
| 211 import string | |
| 212 gencode = {'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T','AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R','CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P','CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R','GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A','GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G','TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L','TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', 'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W'} | |
| 213 | |
| 214 # a function to translate a single codon | |
| 215 def translate_codon(self, codon): | |
| 216 return self.gencode.get(codon.upper(), '#') | |
| 217 # a function to split a sequence into codons | |
| 218 def split_into_codons(self, dna, frame): | |
| 219 codons = [] | |
| 220 for i in range(frame-1, len(dna)-2, 3): | |
| 221 codon = dna[i:i+3] | |
| 222 codons.append(codon) | |
| 223 return codons | |
| 224 # a function to translate a dna sequence in a single frame | |
| 225 def translate_dna_single(self, dna, frame=1): | |
| 226 codons = self.split_into_codons(dna, frame) | |
| 227 amino_acids = '' | |
| 228 for codon in codons: | |
| 229 amino_acids = amino_acids + self.translate_codon(codon) | |
| 230 return amino_acids | |
| 231 | |
| 232 def TWO_VAR(self, workdir): | |
| 233 """ | |
| 234 ###---Two variants (2VAR) codon changes--- | |
| 235 ##---------------------------------- | |
| 236 """ | |
| 237 lines = open(workdir + "temp_file1", "r").read().splitlines() | |
| 238 writ2 = self.write_file(workdir + "temp_file2") | |
| 239 #--- | |
| 240 midline_head = lines[0].strip().split("\t") | |
| 241 try: | |
| 242 midline_headcrp = '\t'.join([w.replace(midline_head[5], 'Gene_Name') for w in midline_head]) | |
| 243 except ValueError: | |
| 244 pass | |
| 245 writ2.write(str(midline_headcrp + "\t" + "ALT-codon_merge-2VAR" + "\t" + "AA-Change-2VAR") + "\n") | |
| 246 #--- | |
| 247 | |
| 248 i=0; TRcode =""; protcode = ""; midline_crpit = "" | |
| 249 for i in range(len(lines)): | |
| 250 #--- | |
| 251 midline_crp = lines[i].strip().split("\t") | |
| 252 try: | |
| 253 if len(midline_crp[5].split("|")) != 1: | |
| 254 GeneName = midline_crp[5].split("|")[5] | |
| 255 midline_crpit = '\t'.join([w.replace(midline_crp[5], GeneName) for w in midline_crp]) | |
| 256 except ValueError: | |
| 257 pass | |
| 258 #--- | |
| 259 try: | |
| 260 beforeline = lines[i-1].strip() | |
| 261 line0 = beforeline.split("\t")[-3] | |
| 262 beforeline1 = re.findall("\d+", line0) | |
| 263 | |
| 264 midline = lines[i].strip() | |
| 265 line1 = midline.split("\t")[-3] | |
| 266 midline1 = re.findall("\d+", line1) | |
| 267 | |
| 268 nextline = lines[i+1].strip() | |
| 269 line2 = nextline.split("\t")[-3] | |
| 270 nextline1 = re.findall("\d+", line2) | |
| 271 #----Condition to replace empty list ([] to ['000']) produced from the header column "AA-Change". | |
| 272 if (line0 == "AA-Change"): | |
| 273 beforeline1.append('000') | |
| 274 elif (line1 == "AA-Change"): | |
| 275 midline1.append('000') | |
| 276 elif (line2 == "AA-Change"): | |
| 277 nextline1.append('000') | |
| 278 #---- | |
| 279 REFbf=[]; line22="" | |
| 280 if ((beforeline1[0] == midline1[0]) or (midline1[0] == nextline1[0])): | |
| 281 spREFbf=[]; lscod1=[]; lscod2=[] | |
| 282 REF= lines[i].strip().split("\t")[-2] | |
| 283 if (midline1[0] == beforeline1[0]): | |
| 284 REFbf = lines[i-1].strip().split("\t")[-2] | |
| 285 | |
| 286 line11 = lines[i].strip().split("\t")[-1] | |
| 287 if (midline1[0] == nextline1[0]): | |
| 288 line22 = lines[i+1].strip().split("\t")[-1] | |
| 289 if REFbf != []: | |
| 290 for cod in REFbf: | |
| 291 spREFbf.append(cod) | |
| 292 for cod in line11: | |
| 293 lscod1.append(cod) | |
| 294 for cod in line22: | |
| 295 lscod2.append(cod) | |
| 296 | |
| 297 if (((lscod1[0].islower()==True and lscod2[0].islower()==True) or (lscod1[1].islower()==True and lscod2[1].islower()==True) or (lscod1[2].islower()==True and lscod2[2].islower()==True)) and ((lscod1[0] == lscod2[0]) or (lscod1[1] == lscod2[1]) or (lscod1[2] == lscod2[2]))): | |
| 298 threeltr_code = [] | |
| 299 if ((lscod1[0].isupper()==True and lscod2[0].islower()==True) or (lscod1[0] == lscod2[0])): | |
| 300 threeltr_code.append(lscod1[0]) | |
| 301 elif((lscod1[0].islower()==True and lscod2[0].isupper()==True) or (lscod1[0] == lscod2[0])): | |
| 302 threeltr_code.append(lscod2[0]) | |
| 303 | |
| 304 if((lscod1[1].isupper()==True and lscod2[1].islower()==True) or (lscod1[1] == lscod2[1])): | |
| 305 threeltr_code.append(lscod1[1]) | |
| 306 elif((lscod1[1].islower()==True and lscod2[1].isupper()==True) or (lscod1[1] == lscod2[1])): | |
| 307 threeltr_code.append(lscod2[1]) | |
| 308 | |
| 309 if((lscod1[2].isupper()==True and lscod2[2].islower()==True) or (lscod1[2] == lscod2[2])): | |
| 310 threeltr_code.append(lscod1[2]) | |
| 311 elif((lscod1[2].islower()==True and lscod2[2].isupper()==True) or (lscod1[2] == lscod2[2])): | |
| 312 threeltr_code.append(lscod2[2]) | |
| 313 #----------------------- | |
| 314 if(len(threeltr_code)==3): | |
| 315 TRcode = ''.join(threeltr_code) | |
| 316 else: | |
| 317 TRcode = 'Multiallelic-t1' | |
| 318 #----------------------- | |
| 319 else: | |
| 320 TRcode = '...' | |
| 321 | |
| 322 if(TRcode != line22): | |
| 323 protcode = self.translate_dna_single(TRcode) | |
| 324 else: | |
| 325 TRcode = "." | |
| 326 | |
| 327 if (REF == REFbf): | |
| 328 protcode = "#####";TRcode = "Multiallelic-t2" | |
| 329 writ2.write(str(midline_crpit + "\t" + TRcode + "\t" + protcode) + "\n") | |
| 330 | |
| 331 except IndexError: | |
| 332 if not midline_crpit.split("\t")[0] == "CHROM": | |
| 333 writ2.write(str(midline_crpit + "\t" + "." + "\t" + protcode) + "\n") | |
| 334 pass | |
| 335 return None | |
| 336 | |
| 337 def THREE_VAR(self, workdir): | |
| 338 """ | |
| 339 ###---Three variants (3VAR) codon changes--- | |
| 340 ##---------------------------------- | |
| 341 """ | |
| 342 lines = open(workdir + "temp_file2", "r").read().splitlines() | |
| 343 writ3 = self.write_file(workdir + "temp_file3") | |
| 344 #--- | |
| 345 midline_head =lines[0].strip() | |
| 346 writ3.write(str(midline_head + "\t" + "ALT-codon_merge-3VAR" + "\t" + "AA-Change-3VAR") + "\n") | |
| 347 #--- | |
| 348 i = 0; TRcode = ""; protcode = "" | |
| 349 for i in range(len(lines)): | |
| 350 try: | |
| 351 #--- | |
| 352 midline_crp = lines[i].strip() | |
| 353 #--- | |
| 354 beforeline = lines[i-1].strip() | |
| 355 line0 = beforeline.split("\t")[-5] | |
| 356 beforeline1 = re.findall("\d+", line0) | |
| 357 midline = lines[i].strip() | |
| 358 line1 = midline.split("\t")[-5] | |
| 359 midline1 = re.findall("\d+", line1) | |
| 360 nextline = lines[i+1].strip() | |
| 361 line2 = nextline.split("\t")[-5] | |
| 362 nextline1 = re.findall("\d+", line2) | |
| 363 | |
| 364 line11=[]; line22=[]; writelst= [] | |
| 365 if ((beforeline1[0] == midline1[0]) or (midline1[0] == nextline1[0])): | |
| 366 lscod1=[]; lscod2=[] | |
| 367 line11 = lines[i].strip().split("\t")[-2] | |
| 368 if (midline1[0] == nextline1[0]): | |
| 369 line22 = lines[i+1].strip().split("\t")[-2] | |
| 370 for cod in line11: | |
| 371 lscod1.append(cod) | |
| 372 for cod in line22: | |
| 373 lscod2.append(cod) | |
| 374 #----------------------- | |
| 375 if(len(lscod1) == 3 and len(lscod2) == 3): | |
| 376 threeltr_code = []; | |
| 377 | |
| 378 if ((lscod1[0].isupper()==True) and (lscod2[0].islower()==True) or | |
| 379 (lscod1[0].isupper()==True and lscod2[0].isupper()==True)): | |
| 380 threeltr_code.append(lscod1[0]) | |
| 381 elif ((lscod1[0].islower()==True) and (lscod2[0].isupper()==True)): | |
| 382 threeltr_code.append(lscod2[0]) | |
| 383 | |
| 384 if ((lscod1[1].isupper()==True) and (lscod2[1].islower()==True) or | |
| 385 (lscod1[1].isupper()==True and lscod2[1].isupper()==True)): | |
| 386 threeltr_code.append(lscod1[1]) | |
| 387 elif ((lscod1[1].islower()==True) and (lscod2[1].isupper()==True)): | |
| 388 threeltr_code.append(lscod2[1]) | |
| 389 | |
| 390 if ((lscod1[2].isupper()==True) and (lscod2[2].islower()==True) or | |
| 391 (lscod1[2].isupper()==True and lscod2[2].isupper()==True)): | |
| 392 threeltr_code.append(lscod1[2]) | |
| 393 elif ((lscod1[2].islower()==True) and (lscod2[2].isupper()==True)): | |
| 394 threeltr_code.append(lscod2[2]) | |
| 395 #----------------------- | |
| 396 if(len(threeltr_code) == 3): | |
| 397 TRcode = ''.join(threeltr_code) | |
| 398 else: | |
| 399 TRcode = 'Multiallelic-t1' | |
| 400 #----------------------- | |
| 401 else: | |
| 402 TRcode = '.' | |
| 403 | |
| 404 if(TRcode != line22): | |
| 405 protcode = self.translate_dna_single(TRcode) | |
| 406 if len(protcode) == 0: | |
| 407 protcode = "."; TRcode = "." | |
| 408 else: | |
| 409 TRcode = "." | |
| 410 | |
| 411 if protcode == "" or TRcode == "": | |
| 412 protcode = "."; TRcode = "." | |
| 413 #----------------------- | |
| 414 writ3.write(str(midline_crp + "\t" + TRcode + "\t" + protcode) + "\n") | |
| 415 | |
| 416 ##---------------------- | |
| 417 except IndexError: | |
| 418 if not midline_crp.split("\t")[0] == "CHROM": | |
| 419 if len(protcode) ==0: | |
| 420 protcode = "."; TRcode = "." | |
| 421 writ3.write(str(midline_crp + "\t" + TRcode + "\t" + protcode) + "\n") | |
| 422 else: | |
| 423 protcode = "."; TRcode = "." | |
| 424 writ3.write(str(midline_crp + "\t" + TRcode + "\t" + protcode) + "\n") | |
| 425 pass | |
| 426 return None | |
| 427 | |
| 428 def PARS_OUT_VAR(self, workdir): | |
| 429 """ | |
| 430 ###---Pars variants (2VAR _ 3VAR) based on change of protein codons --- | |
| 431 ##---------------------------------- | |
| 432 """ | |
| 433 #This first records the ones that do match and saves the value in column 3 (less the final matching amino acid residue). | |
| 434 subprocess.check_output("awk 'index($(NF-6), $(NF-2)) {print}' " + workdir + "temp_file3 | awk '{print $(NF-6)}' | sed s'/.$//' > " + workdir + "matches.list", shell=True) | |
| 435 | |
| 436 #This then searches for the ones that do not match, and then also eliminates any value recorded in matches.list | |
| 437 subprocess.check_output("awk '!index($(NF-6), $(NF-2)) { print }' " + workdir + "temp_file3 | grep -w -v -f " + workdir + "matches.list | awk -F'\t' '{ print }' > " + workdir + "temp_file4", shell=True) | |
| 438 | |
| 439 return None | |
| 440 | |
| 441 def ZYGO_PAIR(self, macaron_output, workdir, FIELDS): | |
| 442 """ | |
| 443 ###---Pair of zygosity check: remove pair of codons for which one SNP is reference homozygous --- | |
| 444 ##---------------------------------- | |
| 445 """ | |
| 446 | |
| 447 Fld_Len = int(len(FIELDS.split(","))) | |
| 448 lines = open(workdir + "temp_file4", "r").read().splitlines() | |
| 449 writ4 = self.write_file(macaron_output) | |
| 450 #--- | |
| 451 midline_head = lines[0].strip() | |
| 452 writ4.write(str(midline_head) + "\n") | |
| 453 #--- | |
| 454 nextline_pos_lst = [] | |
| 455 | |
| 456 for i in range(len(lines)): | |
| 457 #--- | |
| 458 midline_crp = lines[i].strip() | |
| 459 #--- | |
| 460 beforeline = lines[i-1].strip() | |
| 461 beforeline_pos = re.findall("\d+", beforeline.split("\t")[-7]) | |
| 462 midline = lines[i].strip() | |
| 463 midline_pos = re.findall("\d+", midline.split("\t")[-7]) | |
| 464 try: | |
| 465 nextline = lines[i+1].strip() | |
| 466 nextline_pos =re.findall("\d+", nextline.split("\t")[-7]) | |
| 467 ##------- | |
| 468 if (midline_pos[0] == nextline_pos[0]): | |
| 469 Start_lns = int(6) + Fld_Len | |
| 470 End_lns = int(8) | |
| 471 Start_zyg_lns = int(len(midline.split("\t")) - int(Start_lns)) | |
| 472 midline_zyg = midline.split("\t")[-int(Start_zyg_lns):-int(End_lns)] | |
| 473 nextline_zyg = nextline.split("\t")[-int(Start_zyg_lns):-int(End_lns)] | |
| 474 checkList = list(['0/1:1/0', '1/0:0/1', '1/0:1/0', '0/1:0/1', '1/0:1/1', '1/1:1/0', '0/1:1/1', '1/1:0/1', '1/1:1/1']) | |
| 475 Mergetwozyg = ','.join([str(a) + ":" + b for a,b in zip(midline_zyg, nextline_zyg)]) | |
| 476 Mergetwozygsp = Mergetwozyg.split(",") | |
| 477 if set(Mergetwozygsp).intersection(checkList) != set([]): | |
| 478 if ((midline_pos[0] == nextline_pos[0]) and (midline_pos[0] == beforeline_pos[0])): | |
| 479 writ4.write(str(nextline)+"\n") | |
| 480 else: | |
| 481 writ4.write(str(midline+"\n"+nextline)+"\n") | |
| 482 ##------- | |
| 483 except IndexError: | |
| 484 #Condition to remove the duplicated line at the end and prints only paired lines. | |
| 485 nextline_pos_lst.append(nextline_pos[0]) | |
| 486 result = dict((i, nextline_pos_lst.count(i)) for i in nextline_pos_lst) | |
| 487 if len(result) == 1: | |
| 488 writ4.write(str(nextline) + "\n") | |
| 489 | |
| 490 return None | |
| 491 ###------ | |
| 492 | |
| 493 | |
| 494 ## MACARON MAIN | |
| 495 if __name__ == "__main__": | |
| 496 ## Parsing arguments | |
| 497 parser = ArgumentParser(description="-Script to identify SnpClusters (SNPs within the same genetic codon)") | |
| 498 parser.add_argument("-i", "--infile", dest="INPUTFile",default=False, required=True, help="Full path of the input VCF file.") | |
| 499 parser.add_argument("-o", "--outfile", dest="OUTPUTFile",default="./MACARON_output.txt", required=False, help="Path of the output txt file (Default Output file: MACARON_output.txt)") | |
| 500 parser.add_argument("-f", "--fields", dest="Fields", default="QUAL", required=False, help=" Single field name or comma-seperated ',' multiple field names can be given. Field name should be given according to the (INFO) field header of the input vcf file. Example: -f Func.refGene,ExonicFunc.refGene,Gene.refGene,1000g2015aug_all,ExAC_ALL,ExAC_EAS,clinvar_20161128,gnomAD_exome_ALL,gnomAD_genome_ALL,EFF,CSQ") | |
| 501 parser.add_argument("--GATK", dest="GATK_path", default=GATK, required=False, help="Indicate the full path to GATK jar file") | |
| 502 parser.add_argument("--HG_REF", dest="HG_REF_path", default=HG_REF, required=False, help="Indicate the full path to the reference genome fasta file") | |
| 503 parser.add_argument("--SNPEFF", dest="SNPEFF_path", default=SNPEFF, required=False, help="Indicate the full path to SnpEff jar file") | |
| 504 parser.add_argument("--SNPEFF_HG", dest="SNPEFF_HG_version", default=SNPEFF_HG, required=False, help="Indicate SnpEff human genome annotation database version") | |
| 505 parser.add_argument("--gatk4", dest="GATK4", default=False, required=False, action='store_true', help="Add this option when using GATK versions >= 4.0") | |
| 506 parser.add_argument("-v", "--verbosity", dest="Verbosity", default=False, required=False, action='store_true', help="Use to print verbosity (Mostly GATK/SNPEFF output)") | |
| 507 parser.add_argument("-c", "--eco_friendly", dest="ECO", default=False, required=False, action='store_true', help="Save a thread, but you won't be able to stare at the fabulous animation while waiting ...") | |
| 508 | |
| 509 ## Assign arguments to global variables | |
| 510 args = parser.parse_args() | |
| 511 FIELDS = args.Fields | |
| 512 GATK4 = args.GATK4 | |
| 513 VERBOSE = args.Verbosity | |
| 514 GATK = args.GATK_path | |
| 515 HG_REF = args.HG_REF_path | |
| 516 SNPEFF = args.SNPEFF_path | |
| 517 SNPEFF_HG = args.SNPEFF_HG_version | |
| 518 ECO = args.ECO | |
| 519 | |
| 520 ## Inputs / Outputs path & names | |
| 521 INF = args.INPUTFile | |
| 522 OUTF = args.OUTPUTFile | |
| 523 TMPDIR = os.path.dirname(os.path.abspath(OUTF)) + "/macaron_tmp/" | |
| 524 subprocess.check_output("mkdir -p " + TMPDIR, shell=True) | |
| 525 | |
| 526 ######################## | |
| 527 ## MAIN PROCESS ## | |
| 528 ######################## | |
| 529 print(header) | |
| 530 | |
| 531 ## Check if global variables point to existing files: | |
| 532 INF_check = os.path.exists(INF) | |
| 533 GATK_check = os.path.exists(GATK) | |
| 534 HG_REF_check = os.path.exists(HG_REF) | |
| 535 SNPEFF_check = os.path.exists(SNPEFF) | |
| 536 SNPEFF_HG_non_empty = (SNPEFF_HG != "") | |
| 537 if not(INF_check and GATK_check and HG_REF_check and SNPEFF_check and SNPEFF_HG_non_empty): | |
| 538 print(">ERROR : One or several global variable: \n VCF={} > {}\n GATK={} > {}\n HG_REF={} > {}\n SNPEFF={} > {}\n SNPEFF_HG={} > {}\n>Please correct and try again!".format(INF, INF_check, GATK, GATK_check, HG_REF, HG_REF_check, SNPEFF, SNPEFF_check, SNPEFF_HG, SNPEFF_HG_non_empty)) | |
| 539 sys.exit(1) | |
| 540 else: ## If everything checks out, start MACARON | |
| 541 ## Animation event initialization | |
| 542 keep_anim = multiprocessing.Event() | |
| 543 | |
| 544 ## 1)VARIANTS FILTERING, ANNOTATION (GATK,SNPEff) | |
| 545 clF1 = SearchDB().Search_CODON(INF, TMPDIR, FIELDS) | |
| 546 ## 2)SEARCH MULTI-SNPS CODONS | |
| 547 thread = print_step(keep_anim, 6) | |
| 548 clF2 = SearchDB().TWO_VAR(TMPDIR) | |
| 549 clF3 = SearchDB().THREE_VAR(TMPDIR) | |
| 550 end_print_step(keep_anim, thread, 6) | |
| 551 ## 3)CHECK IF SNPCLUSTERS IMPACT CODON | |
| 552 thread = print_step(keep_anim, 7) | |
| 553 clF4 = SearchDB().PARS_OUT_VAR(TMPDIR) | |
| 554 end_print_step(keep_anim, thread, 7) | |
| 555 ## 4) EXTRACT SNPCLUSTERS (Keeping SnpCluster if >=1 sample is Ref-Heterozygous or nonRef-Homozygous) | |
| 556 thread = print_step(keep_anim, 8) | |
| 557 clF5 = SearchDB().ZYGO_PAIR(OUTF, TMPDIR, FIELDS) | |
| 558 end_print_step(keep_anim, thread, 8) | |
| 559 print(footer) | |
| 560 subprocess.check_output("rm -r " + TMPDIR, shell=True) |
