view snpEff_cds_report.py @ 7:fbb6510186df default tip

when codon has ambiguous nucleotide translate to X
author Jim Johnson <jj@umn.edu>
date Mon, 15 Sep 2014 06:13:47 -0500
parents a64ef0611117
children
line wrap: on
line source

#!/usr/bin/python
## version 1.2
import sys,os,tempfile,re
import httplib, urllib
import optparse
#import vcfClass
#from vcfClass import *
#import tools
#from tools import *

debug = False
cds_anchor = 'cds_variation'
aa_anchor = 'aa_variation'

codon_map = {"UUU":"F", "UUC":"F", "UUA":"L", "UUG":"L",
    "UCU":"S", "UCC":"S", "UCA":"S", "UCG":"S",
    "UAU":"Y", "UAC":"Y", "UAA":"*", "UAG":"*",
    "UGU":"C", "UGC":"C", "UGA":"*", "UGG":"W",
    "CUU":"L", "CUC":"L", "CUA":"L", "CUG":"L",
    "CCU":"P", "CCC":"P", "CCA":"P", "CCG":"P",
    "CAU":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
    "CGU":"R", "CGC":"R", "CGA":"R", "CGG":"R",
    "AUU":"I", "AUC":"I", "AUA":"I", "AUG":"M",
    "ACU":"T", "ACC":"T", "ACA":"T", "ACG":"T",
    "AAU":"N", "AAC":"N", "AAA":"K", "AAG":"K",
    "AGU":"S", "AGC":"S", "AGA":"R", "AGG":"R",
    "GUU":"V", "GUC":"V", "GUA":"V", "GUG":"V",
    "GCU":"A", "GCC":"A", "GCA":"A", "GCG":"A",
    "GAU":"D", "GAC":"D", "GAA":"E", "GAG":"E",
    "GGU":"G", "GGC":"G", "GGA":"G", "GGG":"G",}

def reverseComplement(seq) :
  rev = seq[::-1].lower().replace('u','A').replace('t','A').replace('a','T').replace('c','G').replace('g','C').upper()
  return rev

def translate(seq) :
  rna = seq.upper().replace('T','U')
  aa = []
  for i in range(0,len(rna) - 2, 3):
    codon = rna[i:i+3]
    aa.append(codon_map[codon] if codon in codon_map else 'X')
  return ''.join(aa)

"""
SNfEffect vcf reported variations to the reference sequence, so need to reverse complement for coding sequences on the negative strand
Queries that request a sequence always return the sequence in the first column regardless of the order of attributes.
"""
query_ccds_template = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >
	<Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
		<Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
		<Filter name = "with_ccds" excluded = "0"/>
		<Attribute name = "ensembl_transcript_id" />
		<Attribute name = "ccds" />
	</Dataset>
</Query>
"""
ccds_filter = '<Filter name = "with_ccds" excluded = "0"/>'
query_template = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query  virtualSchemaName = "default" formatter = "TSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
        <Dataset name = "__ENSEMBL_DATASET_HERE__" interface = "default" >
                <Filter name = "ensembl_transcript_id" value = "__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__"/>
                <Filter name = "biotype" value = "protein_coding"/>
                __CCDS_FILTER_HERE__
                <Attribute name = "cdna" />
                <Attribute name = "ensembl_gene_id" />
                <Attribute name = "ensembl_transcript_id" />
                <Attribute name = "strand" />
                <Attribute name = "transcript_start" />
	        <Attribute name = "transcript_end"/>
	        <Attribute name = "exon_chrom_start"/>
	        <Attribute name = "exon_chrom_end"/>
		<Attribute name = "cdna_coding_start" />
		<Attribute name = "cdna_coding_end" />
	        <Attribute name = "cds_length"/>
	        <Attribute name = "rank"/>
                <Attribute name = "5_utr_start" />
                <Attribute name = "5_utr_end" />
                <Attribute name = "3_utr_start" />
                <Attribute name = "3_utr_end" />
	        <Attribute name = "ensembl_peptide_id"/>
		<Attribute name = "start_position" />
		<Attribute name = "end_position" />
        </Dataset>
</Query>
"""
"""
Columns(19):
['cDNA sequences', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Strand', 'Transcript Start (bp)', 'Transcript End (bp)', 'Exon Chr Start (bp)', 'Exon Chr End (bp)', 'cDNA coding start', 'cDNA coding end', 'CDS Length', 'Exon Rank in Transcript', "5' UTR Start", "5' UTR End", "3' UTR Start", "3' UTR End", 'Ensembl Protein ID', 'Gene Start (bp)', 'Gene End (bp)']

  0	cDNA sequence
  1	Ensembl Gene ID
  2	Ensembl Transcript ID
  3	Strand
- 4	Transcript Start (bp)
- 5	Transcript End (bp)
  6	Exon Chr Start (bp)
  7	Exon Chr End (bp)
  8	CDS Start
  9	CDS End
  10	CDS Length
- 11	Exon Rank in Transcript
  12	5' UTR Start
  13	5' UTR End
  14	3' UTR Start
  15	3' UTR End
- 16	Ensembl Protein ID
- 17	Gene Start (bp)
- 18	Gene End (bp)
"""

# return {transcript_id : ccds_id} 
def getCcdsIDs(bimoart_host, ensembl_dataset, transcript_ids):
  ccds_dict = dict()
  if transcript_ids:
    query = query_ccds_template
    query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
    query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',','.join(transcript_ids),query)
    params = urllib.urlencode({'query':query})
    headers = {"Accept": "text/plain"}
    if debug: print >> sys.stdout, "CCDS Query: %s\n" % (query)
    try:
      if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
      conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
      conn.request("POST", "/biomart/martservice", params, headers)
      response = conn.getresponse()
      data = response.read()
      if len(data) > 0:
        # if debug: print  >> sys.stdout, "%s\n\n" % data
        lines = data.split('\n')
        for line in lines:
          fields = line.split('\t')
          if len(fields) == 2:
            (transcript_id,ccds) = fields
            ccds_dict[transcript_id] = ccds
        if debug: print >> sys.stdout, "CCDS response: %s\n" % (lines)
    except Exception, e:
      sys.stdout.write( "transcript_ids: %s  %s\n" % (transcript_ids, e) )
  return ccds_dict

def getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds=True):
  query = query_template
  query = re.sub('__ENSEMBL_DATASET_HERE__',ensembl_dataset if ensembl_dataset else 'hsapiens_gene_ensembl',query)
  query = re.sub('__YOUR_ENSEMBL_TRANSCRIPT_ID_HERE__',transcript_id,query)
  query = re.sub('__CCDS_FILTER_HERE__',ccds_filter if filter_ccds else '',query)
  return query

# return [ensembl_gene_id,ensembl_transcript_id,strand,cds_pos,cds_ref,cds_alt,coding_sequence, variant_sequence]
def getEnsemblInfo(snpEffect,bimoart_host,ensembl_dataset,filter_ccds=True):
  transcript_id = snpEffect.transcript
  chr_pos = snpEffect.pos
  query = getBiomartQuery(transcript_id,ensembl_dataset,filter_ccds)
  if debug: print  >> sys.stdout, "getEnsemblInfo:\n%s\n" % (query)
  params = urllib.urlencode({'query':query})
  headers = {"Accept": "text/plain"}
  pos = snpEffect.pos
  cds_pos = None  # zero based offset
  coding_sequence = ''
  cds_ref = snpEffect.ref
  cds_alt = snpEffect.alt
  try: 
    if debug: print >> sys.stdout, "Ensembl Host: %s\n" % (bimoart_host)
    conn = httplib.HTTPConnection(bimoart_host if bimoart_host != None else 'www.biomart.org')
    conn.request("POST", "/biomart/martservice", params, headers)
    response = conn.getresponse()
    data = response.read()
    if len(data) > 0:
      # if debug: print  >> sys.stdout, "%s\n\n" % data
      lines = data.split('\n')
      # Use the header line to determine the order of fields
      line = lines[0]
      # if debug: print  >> sys.stdout, "Headers:\n%s\n" % line
      colnames = line.split('\t')
      # for i in range(len(colnames)):
      #
      if debug: print  >> sys.stdout, "Columns(%d):\n%s\n" % (len(colnames), colnames)
      for line in lines[1:]:
        if line == None or len(line) < 2: 
          continue
        field = line.split('\t')
        if len(field) != len(colnames):
          continue
        if debug: print  >> sys.stdout, "Entry(%d):\n%s\n" % (len(field),line)
        if field[10] == None or len(field[10]) < 1:
          continue
        if debug:
          for i in range(len(colnames)):
            print  >> sys.stdout, "%d\t%s :\n%s\n" % (i,colnames[i],field[i])
        snpEffect.gene_id = field[1]
        strand = '+' if int(field[3]) > 0 else '-'
        snpEffect.strand = strand
        # coding_sequence is first
        if len(field) > 10 and re.match('[ATGCN]+',field[0]):
          if field[10] == None or len(field[10]) < 1:
            continue
          cdna_seq = field[0]
          in_utr = False
          # Could be mutliple UTRs exons - sum up lengths for cds offset into cdna sequence
          utr_5_starts = sorted(eval('[' + re.sub(';',',',field[12]) + ']'),reverse=(strand == '-'))
          utr_5_ends = sorted(eval('[' + re.sub(';',',',field[13]) + ']'),reverse=(strand == '-'))
          utr_5_lens = []
          for i in range(len(utr_5_starts)):
            utr_5_start = int(utr_5_starts[i])
            utr_5_end = int(utr_5_ends[i])
            utr_5_lens.append(abs(utr_5_end - utr_5_start) + 1)
            if utr_5_start <= pos  <= utr_5_end :
              in_utr = True
              if debug: print  >> sys.stdout, "in utr_5: %s     %s     %s\n" % (utr_5_start,pos,utr_5_end);
              break
          utr_3_starts = sorted(eval('[' + re.sub(';',',',field[14]) + ']'),reverse=(strand == '-'))
          utr_3_ends = sorted(eval('[' + re.sub(';',',',field[15]) + ']'),reverse=(strand == '-'))
          for i in range(len(utr_3_starts)):
            utr_3_start = int(utr_3_starts[i])
            utr_3_end = int(utr_3_ends[i])
            if utr_3_start <= pos  <= utr_3_end :
              in_utr = True
              if debug: print  >> sys.stdout, "in utr_3: %s     %s     %s\n" % (utr_3_start,pos,utr_3_end);
              break
          # Get coding sequence from cdna
          cdna_length = int(field[10])
          cdna_coding_start =  sorted(eval('[' + re.sub(';',',',field[8]) + ']'))
          cdna_coding_end  = sorted(eval('[' + re.sub(';',',',field[9]) + ']'))
          cdna_lens = [] 
          for i in range(len(cdna_coding_start)):
            cdna_lens.append(cdna_coding_end[i] - cdna_coding_start[i] + 1)
          if debug: print  >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
          cdna_cds_offset = cdna_coding_start[0] - 1 # 0-based array offet
          for i in range(len(cdna_coding_start)):
            if debug: print  >> sys.stdout, "%s\n" % cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]+1]
            coding_sequence += cdna_seq[cdna_coding_start[i]-1:cdna_coding_end[i]]
          snpEffect.cds = coding_sequence
          if coding_sequence and len(coding_sequence) >= 3:
            snpEffect.cds_stop_codon = coding_sequence[-3:]
          if debug: print  >> sys.stdout, "coding_seq (%d from %d):\n%s" % (len(coding_sequence),cdna_length,coding_sequence)
          if debug: print  >> sys.stdout, "cdna_coding (%d):\n %s\n %s\n %s\n" % (len(cdna_coding_start),cdna_coding_start,cdna_coding_end,cdna_lens)
          if debug: print  >> sys.stdout, "5_utr %s %s %s\n" % (utr_5_starts,utr_5_ends,utr_5_lens)
          if  not in_utr:
            exon_start = sorted(eval('[' + re.sub(';',',',field[6]) + ']'),reverse=(strand == '-'))
            exon_end = sorted(eval('[' + re.sub(';',',',field[7]) + ']'),reverse=(strand == '-'))
            exon_rank = sorted(eval('[' + re.sub(';',',',field[11]) + ']'),reverse=(strand == '-'))
            exon_lens = [] 
            for i in range(len(exon_start)):
              exon_lens.append(exon_end[i] - exon_start[i] + 1)
            if debug: print  >> sys.stdout, "exons (%d) strand = %s :\n %s\n %s\n %s\n" % (len(exon_start),strand,exon_start,exon_end, exon_lens)
            if debug: 
              bp_tot = 0
              for i in range(len(exon_start)):
                exon_len = exon_end[i] - exon_start[i] + 1
                bp_tot += exon_len
                print >> sys.stdout, "test: %s     %s     %s     %s     %s    %d   %d\n" % (exon_start[i],pos,exon_end[i],exon_start[i] < pos < exon_end[i], pos - exon_start[i], exon_len, bp_tot)
            cds_idx = 0
            for i in range(len(exon_start)):
              # Does this exon have cds bases - i.e. not entirely in UTR
              if len(utr_5_ends) > 0:
                if strand == '+' and len(utr_5_ends) > 0 and exon_end[i] <= utr_5_ends[-1]:
                  continue
                if strand == '-' and len(utr_5_starts) > 0 and exon_start[i] >= utr_5_starts[-1]:
                  continue
              exon_len = exon_end[i] - exon_start[i] + 1
              if exon_start[i] <= pos <= exon_end[i]:
                if strand == '+':
                  if cds_idx: # find offset from start of cdna_coding and exon
                    offset = pos - exon_start[i]
                  else: # find offset from end of cdna_coding and exon
                    offset = pos - ( exon_start[i] if len(utr_5_ends) < 1 else max(exon_start[i], utr_5_ends[-1]+1) )
                  cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset
                else:  # negative strand
                  cds_ref = reverseComplement(snpEffect.ref)
                  cds_alt = reverseComplement(snpEffect.alt)
                  offset = ( exon_end[i] if len(utr_5_starts) < 1 else min(exon_end[i], utr_5_starts[-1] -1) ) - pos
                  cds_pos = cdna_coding_start[cds_idx] - cdna_coding_start[0] + offset - (len(cds_ref) - 1)
                snpEffect.cds_pos = cds_pos
                snpEffect.cds_ref = cds_ref
                snpEffect.cds_alt = cds_alt
                return snpEffect
              else:  
                cds_idx += 1
  except Exception, e:
    sys.stdout.write( "transcript_id: %s %s %s\n" % (transcript_id,chr_pos, e) )
  finally:
    if conn != None :
      conn.close()
  return None

"""
  Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
  FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
"""
class SnpEffect( object ):
  report_headings = ['Gene','Variant Position','Reference','Variation','Prevalence','Sequencing Depth','Transcript','AA Position','AA change','AA Length','Stop Codon','Stop Region','AA Variation']
  def __init__(self,chrom,pos,ref,alt,freq,depth,effect = None, snpEffVersion = None, biomart_host = None, filter_ccds = False):
    self.chrom = chrom
    self.pos = int(pos)
    self.ref = ref
    self.alt = alt
    self.freq = float(freq) if freq else None
    self.depth = int(depth) if depth else None
    # From SnpEffect VCF String
    self.effect = None
    self.effect_impact = None
    self.functional_class = None
    self.codon_change = None
    self.amino_acid_change = None
    self.amino_acid_length = None
    self.gene_name = None
    self.gene_id = None
    self.gene_biotype = None
    self.coding = None
    self.transcript = None
    self.transcript_ids = None
    self.exon = None
    self.cds_stop_codon = None
    self.cds_stop_pos = None
    # retrieve from ensembl
    self.strand = None
    self.cds_pos = None
    self.cds_ref = None
    self.cds_alt = None
    self.cds = None
    self.aa_pos = None
    self.aa_len = None
    self.alt_stop_pos = None
    self.alt_stop_codon = None
    self.alt_stop_region = None  ## includes base before and after alt_stop_codon
    self.use_eff_aa_change = False
  def chrPos(self):
    return "%s:%s" % (self.chrom,self.pos)
  def setEffect(self, effect, snpEffVersion = None):
    if snpEffVersion and snpEffVersion == '2':
      (effect_impact,functional_class,codon_change,amino_acid_change,gene_name,gene_biotype,coding,transcript,exon) = effect[0:9]
    else: # SnpEffVersion 3  # adds Amino_Acid_length field
      (effect_impact,functional_class,codon_change,amino_acid_change,amino_acid_length,gene_name,gene_biotype,coding,transcript,exon) = effect[0:10]
      self.amino_acid_length = amino_acid_length
    self.effect_impact = effect_impact
    self.functional_class = functional_class
    self.codon_change = codon_change
    self.amino_acid_change = amino_acid_change
    self.gene_name = gene_name
    self.gene_biotype = gene_biotype
    self.coding = coding
    self.transcript = transcript
    self.exon = exon
  def parseEffect(self, effect, snpEffVersion = None):
    (eff,effs) = effect.rstrip(')').split('(')
    self.effect = eff
    eff_fields = effs.split('|')
    if debug: print >> sys.stdout, "parseEffect:\n\t%s\n\t%s\n" % (effect,eff_fields)
    self.setEffect(eff_fields, snpEffVersion)
  def fetchEnsemblInfo(self,biomart_host=None,ensembl_dataset=None,filter_ccds=False):
    getEnsemblInfo(self,biomart_host,ensembl_dataset,filter_ccds)
    return len(self.cds) > 0 if self.cds else False
  def score(self):
    return self.freq * self.depth if self.freq and self.depth else -1
  def getCodingSequence(self):
    return self.cds
  def getVariantSequence(self):
    if self.cds and self.cds_pos and self.cds_ref and self.cds_alt:
      return self.cds[:self.cds_pos] + self.cds_alt + self.cds[self.cds_pos+len(self.cds_ref):]
    else:
      if debug: print >> sys.stdout, "getVariantSequence:  %s\t%s\t%s\t%s\n" % (str(self.cds_pos) if self.cds_pos else 'no pos', self.cds_ref, self.cds_alt, self.cds)
      return None
  def getCodingAminoSequence(self):
    seq = translate(self.cds) if self.cds else None
    if seq:
      self.aa_len = len(seq)
    return seq
  def getVariantAminoSequence(self):
    variant_seq = self.getVariantSequence()
    return translate(variant_seq) if variant_seq else None
  def getVariantPeptide(self,toStopCodon=True):
    (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
    if var_aa:
      if toStopCodon:
        end_pos = var_aa_end
      else:
        end_pos = len(var_aa)
      novel_peptide = var_aa[var_aa_pos:end_pos]
      return novel_peptide
    return None
  # [preAA,varAA,postAA, varPeptide, varOffset, subAA]
  def getNonSynonymousPeptide(self,start_count,end_count,toStopCodon=True):
    (coding_seq, alt_seq, pos, coding_aa, var_aa, var_aa_pos, var_aa_end) = self.getSequences()
    if var_aa:
      start_pos = max(var_aa_pos - start_count,0) if start_count and int(start_count) >= 0 else 0
      if toStopCodon:
        end_pos = var_aa_end
      else:
        end_offset = end_count + 1
        end_pos = min(var_aa_pos+end_offset,len(var_aa)) if end_offset and int(end_offset) >= 0 else var_aa_end
      try:
        varAA = var_aa[var_aa_pos] if var_aa_pos < len(var_aa) else '_' 
        if debug: print >> sys.stdout, "HGVS %s %s pos:\t%d %d %d" % (self.transcript, self.effect, start_pos, var_aa_pos, end_pos)
        mutation = "%s%d%s" % (coding_aa[var_aa_pos],var_aa_pos+1,varAA)
        preAA = var_aa[start_pos:var_aa_pos] # N-term side
        postAA = var_aa[var_aa_pos+1:end_pos] if var_aa_pos+1 < len(var_aa) else '' # C-term side
        novel_peptide = var_aa[start_pos:end_pos]
        return [preAA,varAA,postAA,novel_peptide,var_aa_pos - start_pos,mutation]
      except Exception, e:
        sys.stdout.write( "getNonSynonymousPeptide:%s %s\n" % (self.transcript,e) )
    return None
  # [coding_dna, variant_dna, cds_pos, coding_aa, variant_aa, aa_start, aa_stop]
  # positions are 0-based array indexes 
  def getSequences(self):
    coding_dna = self.getCodingSequence()
    coding_aa = self.getCodingAminoSequence()
    var_dna = self.getVariantSequence()
    var_aa = self.getVariantAminoSequence()
    var_aa_pos = None
    var_aa_end = None
    if var_aa:
      var_aa_pos = self.cds_pos / 3
      for j in range(var_aa_pos,min(len(var_aa),len(coding_aa))):
        if var_aa[j] != coding_aa[j]:
          var_aa_pos = j
          break
      self.aa_pos = var_aa_pos
      var_stop = var_aa.find('*',var_aa_pos)
      if var_stop < 0:
        var_aa_end = len(var_aa)
      else:
        var_aa_end = var_stop + 1 # include the stop codon 
        stop_pos = var_stop * 3
        self.alt_stop_pos = stop_pos
        self.alt_stop_codon = var_dna[stop_pos:stop_pos+3]
        self.alt_stop_region = (var_dna[stop_pos - 1] + '-' if stop_pos > 0 else '') + \
                               self.alt_stop_codon + \
                               ( '-' + var_dna[stop_pos+3] if len(var_dna) > stop_pos+3  else '')
    return [coding_dna,var_dna,self.cds_pos,coding_aa,var_aa, var_aa_pos, var_aa_end]
  def inHomopolymer(self,polyA_limit):
    if polyA_limit:   ## library prep can results in inserting or deleting an A in a poly A region
      ## check if base changes at cds_pos involve A or T 
      bdiff = None # the difference between the cds_ref and cds_alt
      boff = 0 # the offset to the difference
      if len(self.cds_ref) < len(self.cds_alt):
        bdiff = re.sub(self.cds_ref,'',self.cds_alt)
        boff = self.cds_alt.find(bdiff)
      elif len(self.cds_alt) < len(self.cds_ref):
        bdiff = re.sub(self.cds_alt,'',self.cds_ref)
        boff = self.cds_ref.find(bdiff)
      if bdiff:
        ## check the number of similar base neighboring the change
        alt_seq = self.getVariantSequence()
        subseq = alt_seq[max(self.cds_pos-polyA_limit-2,0):min(self.cds_pos+polyA_limit+2,len(alt_seq))]
        ## adjust match length if the cps_pos is within polyA_limit form sequence end
        match_len = min(self.cds_pos,min(len(alt_seq)-self.cds_pos - boff,polyA_limit))
        if debug: print >> sys.stdout, "polyA bdiff %s   %s  %d  %d\n" % (bdiff,subseq, match_len, boff)
        ## Pattern checks for match of the repeated base between the polyA_limit and the length of the sub sequence times
        if bdiff.find('A') >= 0:
          pat = '^(.*?)(A{%d,%d})(.*?)$' % (match_len,len(subseq))
          match = re.match(pat,subseq)
          if debug: print >> sys.stdout, "%s %s %s  %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
          if match:
            return True
        if bdiff.find('T') >= 0:
          pat = '^(.*?)(T{%d,%d})(.*?)$' % (match_len,len(subseq))
          match = re.match(pat,subseq)
          if debug: print >> sys.stdout, "%s %s %s  %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups() if match else 'no match')
          if match:
            if debug: print >> sys.stdout, "%s %s %s  %s %s\n" % (self.transcript, self.cds_ref, self.cds_alt, subseq, match.groups())
            return True
    return False  
  def getReportHeader():
    return report_headings
  def getReportFields(self):
    gene_name = self.gene_name  if self.gene_name else ''
    cds_ref = self.cds_ref if self.cds_ref else ''
    cds_alt = self.cds_alt if self.cds_alt else ''
    hgvs = self.getNonSynonymousPeptide(10,10,toStopCodon=False)
    if debug: print >> sys.stdout, "HGVS %s" % hgvs
    var_aa = self.getVariantPeptide(toStopCodon=True)
    freq = "%2.2f" % self.freq if self.freq else ''
    depth = "%d" % self.depth if self.depth else ''
    aa_pos = "%d" % (self.aa_pos + 1) if self.aa_pos else ''
    aa_len = "%d" % self.aa_len if self.aa_len else ''
    gene_tx_id = '|'.join([self.gene_id,self.transcript]) if self.gene_id else self.transcript
    stop_codon = self.alt_stop_codon if self.alt_stop_codon else ''
    stop_region = self.alt_stop_region if self.alt_stop_region else ''
    chrpos = self.chrPos()
    aa_change = self.amino_acid_change if (self.use_eff_aa_change and self.amino_acid_change) else hgvs[5]
    return [gene_name,chrpos,cds_ref,cds_alt,freq,depth,gene_tx_id,aa_pos,aa_change,aa_len,stop_codon,stop_region,var_aa if var_aa else '']

def __main__():

  def printReportTsv(output_file, snpEffects):
    if output_file:
      print >> output_file, "# %s" % '\t'.join(SnpEffect.report_headings)
      for snpEffect in snpEffects:
        (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,stop_region,novel_peptide) = snpEffect.getReportFields()
        if not snpEffect.effect == 'FRAME_SHIFT':
          (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
          novel_peptide = "%s_%s_%s" %  (preAA,varAA,postAA)
        print >> output_file, "%s" % '\t'.join([gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,novel_peptide])

  """
  http://www.ensembl.org/Homo_sapiens/Search/Details?species=Homo_sapiens;idx=Gene;end=1;q=CCDC111
  http://www.ensembl.org/Homo_sapiens/Search/Results?species=Homo_sapiens;idx=;q=ENST00000314970
  http://www.ensembl.org/Homo_sapiens/Transcript/Summary?g=ENSG00000164306;r=4:185570821-185616117;t=ENST00000515774
  """
  def printReportHtml(output_file, detail_dir, snpEffects):
    if output_file:
      print >> output_file, '<HTML><BODY>\n'
      print >> output_file, '<TABLE BORDER="1">\n'
      print >> output_file, '<TR align="LEFT"><TH>Gene</TH><TH>Variant Position</TH><TH>Reference</TH><TH>Variation</TH><TH>Prevalence</TH><TH>Sequencing Depth</TH><TH>Transcript Details</TH><TH>AA Position</TH><TH>AA Change</TH><TH>AA Length</TH> <TH>Stop Codon</TH><TH>AA Variation</TH></TR>\n'
      for snpEffect in snpEffects:
        (gene_name,chrpos,cds_ref,cds_alt,freq,depth,transcript_name,alt_aa_pos,aa_change,aa_len,stop_codon,stop_region,novel_peptide) = snpEffect.getReportFields()
        refname = '_'.join([snpEffect.transcript,str(snpEffect.pos),snpEffect.ref,snpEffect.alt]) + '.html'
        aa_ref = '#'.join([refname,aa_anchor])
        refpath = os.path.join(detail_dir,refname)
        try:
          ref_file = open(refpath,'w')
          printEffDetailHtml(ref_file,snpEffect)
          ref_file.close()
        except Exception, e:
          sys.stderr.write( "SnpEffect:%s %s\n" % (refpath,e) )
        if snpEffect.effect == 'FRAME_SHIFT':
          print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s">%s</A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,aa_ref,novel_peptide)
        else:
          (preAA,varAA,postAA, allSeq, varOffset, subAA) = snpEffect.getNonSynonymousPeptide(10,10,toStopCodon=False)
          print >> output_file, '<TR><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD ALIGN=RIGHT>%s</TD><TD><A HREF="%s">%s</A></TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD ALIGN=RIGHT>%s</TD><TD>%s</TD><TD><A HREF="%s"><small><I>%s</I></small><B>%s</B><small><I>%s</I></small></A></TD></TR>\n' % (gene_name,chrpos,cds_ref,cds_alt,freq,depth,refname,transcript_name,alt_aa_pos,aa_change,aa_len,stop_region,aa_ref, preAA,varAA,postAA)
      print >> output_file, '</TABLE>\n'
      print >> output_file, '</BODY></HTML>\n'

  def printEffDetailHtml(output_file,snpEffect,wrap_len=60):
    (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
    seq_len = len(coding_seq)
    print >> output_file, '<HTML><BODY>\n'
    print >> output_file, '<TABLE>\n'
    print >> output_file, '<TR><TD>Gene:</TD><TD>%s</TD></TR>\n' % snpEffect.gene_name
    print >> output_file, '<TR><TD>Allele:</TD><TD>%s/%s</TD></TR>\n' % (snpEffect.cds_ref,snpEffect.cds_alt)
    print >> output_file, '<TR><TD>Transcript:</TD><TD>%s|%s</TD></TR>\n' % (snpEffect.gene_id,snpEffect.transcript)
    print >> output_file, '<TR><TD>Transcript Strand:</TD><TD>%s</TD></TR>\n' % snpEffect.strand
    print >> output_file, '<TR><TD>Position in transcript:</TD><TD><A HREF="%s">%d</A></TD></TR>\n' % ("#%s" % cds_anchor,(snpEffect.cds_pos + 1))
    print >> output_file, '</TABLE>\n'
    if alt_aa:
      alt_aa_pos = pos / 3 
      if coding_aa:
        for j in range(alt_aa_pos,min(len(alt_aa),len(coding_aa))):
          if alt_aa[j] != coding_aa[j]:
            alt_aa_pos = j
            break
      alt_aa_end = 0
      for j in range(alt_aa_pos,len(alt_aa)):
        if alt_aa[j] == '*':
          alt_aa_end = j
          break
    seq = []
    for j in range(1,wrap_len+1):
      seq.append('-' if j % 10 else '|')
    hrule = '</TD><TD>'.join(seq)
    print >> output_file, '<TABLE cellspacing="0">'
    for i in range(0,seq_len,wrap_len):
      e = min(i + wrap_len,seq_len)
      sa = i/3 
      ea = (i+wrap_len)/3
      print >> output_file, "<TR STYLE=\"background:#DDD\"><TD ALIGN=RIGHT></TD><TD>%s</TD><TD></TD ALIGN=RIGHT></TR>\n" % (hrule)
      print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
      for j in range(i,i + wrap_len):
        if j < len(coding_seq):
          if pos == j:
            print >> output_file, "<TD><FONT COLOR=\"#F00\"><A NAME=\"%s\">%s</A></FONT></TD>" % (cds_anchor,coding_seq[j])
          elif pos <= j < pos + len(ref):
            print >> output_file, "<TD><FONT COLOR=\"#F00\">%s</FONT></TD>" % coding_seq[j]
          else:
            print >> output_file, "<TD>%s</TD>" % coding_seq[j]
        else:
          print >> output_file, "<TD></TD>"
      print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
      if coding_aa:
        print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
        for j in range(sa,ea):
          if j < len(coding_aa):
            print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % coding_aa[j]
          else:
            print >> output_file, "<TD COLSPAN=\"3\"></TD>"
        print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
      if alt_aa:
        print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (sa+1)
        for j in range(sa,ea):
          if j < len(alt_aa):
            if alt_aa_pos == j:
              print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\"><A NAME=\"%s\">%s</A></TD>" % (aa_anchor,alt_aa[j])
            elif alt_aa_pos < j <= alt_aa_end:
              print >> output_file, "<TD COLSPAN=\"3\" BGCOLOR=\"#8F8\">%s</TD>" % alt_aa[j]
            else:
              print >> output_file, "<TD COLSPAN=\"3\">%s</TD>" % alt_aa[j]
          else:
            print >> output_file, "<TD COLSPAN=\"3\"></TD>"
        print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % ea
      print >> output_file, "<TR><TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD>" % (i+1)
      for j in range(i,i + wrap_len):
        if j < len(alt_seq):
          if pos <= j < pos + len(alt):
            print >> output_file, "<TD><FONT COLOR=\"#00F\">%s</FONT></TD>" % alt_seq[j]
          else:
            print >> output_file, "<TD>%s</TD>" % alt_seq[j]
        else:
          print >> output_file, "<TD></TD>"
      print >> output_file, "<TD BGCOLOR=#DDD ALIGN=RIGHT>%d</TD></TR>\n" % e
    print >> output_file, "</TABLE>"
    print >> output_file, "</BODY></HTML>"

  def printSeqText(output_file,snpEffect, wrap_len=120):
    nw = 6
    (coding_seq, alt_seq, pos, coding_aa, alt_aa, alt_aa_pos, alt_aa_end) = snpEffect.getSequences()
    if coding_seq:
      seq_len = max(len(coding_seq),len(alt_seq) if alt_seq != None else 0)
      rule = '---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|---------|'
      for i in range(0,seq_len,wrap_len):
        e = min(i + wrap_len,seq_len)
        sa = i/3 
        ea = e/3
        print >> output_file, "%*s %-*s %*s" % (nw,'',wrap_len,rule[:wrap_len],nw,'')
        print >> output_file, "%*d %-*s %*d" % (nw,i+1,wrap_len,coding_seq[i:e],nw,e)
        if coding_aa:
          print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,'  '.join(coding_aa[sa:ea]),nw,ea)
        if alt_aa:
          print >> output_file, "%*d %-*s %*d" % (nw,sa+1,wrap_len,'  '.join(alt_aa[sa:ea]),nw,ea)
        if alt_seq:
          print >> output_file, "%*d %-*s %*d\n" % (nw,i+1,wrap_len,alt_seq[i:e],nw,e)

  # Parse the command line options
  usage = "Usage: snpEff_cds_report.py filter [options]"
  parser = optparse.OptionParser(usage = usage)
  parser.add_option("-i", "--in",
                    action="store", type="string",
                    dest="vcfFile", help="input vcf file")
  parser.add_option("-o", "--out",
                    action="store", type="string",
                    dest="output", help="output report file")
  parser.add_option("-H", "--html_report",
                    action="store", type="string",
                    dest="html_file", help="html output report file")
  parser.add_option("-D", "--html_dir",
                    action="store", type="string",
                    dest="html_dir", help="html output report file")
  parser.add_option("-t", "--tsv_file",
                    action="store", type="string",
                    dest="tsv_file", help="TAB Separated Value (.tsv) output report file")
  parser.add_option("-e", "--ensembl_host",
                    action="store", type="string", 
                    dest="ensembl_host", default='www.biomart.org', help='bimoart ensembl server default: www.biomart.org')
  parser.add_option("-f", "--effects_filter",
                    action="store", type="string", default=None,
                    dest="effects_filter", help="a list of effects to filter")
  parser.add_option("-O", "--ensembl_dataset",
                    action="store", type="string", 
                    dest="ensembl_dataset", default='hsapiens_gene_ensembl', help='bimoart ensembl dataset default: hsapiens_gene_ensembl')
  parser.add_option("-p", "--polyA_limit",
                    action="store", type="int", default=None, help='ignore variants that are in a poly A longer than this value' )
  parser.add_option('-S', '--snpeff_aa_change', dest='snpeff_aa_change', action='store_true', default=False, help='Report the SnpEff Amino_Acid_change field'  )
  parser.add_option('-a', '--all_effects', dest='all_effects', action='store_true', default=False, help='Search for all effects'  )
  parser.add_option('-c', '--with_ccds', dest='with_ccds', action='store_true', default=False, help='Only variants with CCDS entries'  )
  parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout'  )

  (options, args) = parser.parse_args()

  debug = options.debug
  ensembl_host = options.ensembl_host

  # Check that a single vcf file is given.
  if options.vcfFile == None:
    parser.print_help()
    print >> sys.stderr, "\nInput vcf file (-i, --input) is required for variant report "
    exit(1)
  outputFile = None
  outputHtml = None
  detailDir = None
  outputTsv = None
  tmpHtmlName = None
  tmpHtml = None
  effects_list = []
  # Set the output file to stdout if no output file was specified.
  if options.output == None and options.html_file == None and options.tsv_file == None:
    outputFile = sys.stdout
  if options.output != None:
    output = os.path.abspath(options.output)
    outputFile = open(output, 'w')
  if options.tsv_file != None:
    output = os.path.abspath(options.tsv_file)
    outputTsv = open(output, 'w')
  if options.html_file != None:
    output = os.path.abspath(options.html_file)
    outputHtml = open(output, 'w')
    if options.html_dir == None:
      detailDir = os.path.dirname(os.path.abspath(output))
    else:
      detailDir = options.html_dir
  if detailDir:
    if not os.path.isdir(detailDir):
      os.makedirs(detailDir)
  if options.effects_filter:
    eff_codes = options.effects_filter.split(',')
    for val in eff_codes:
      code = val.strip()
      if code:
        effects_list.append(code)
      
  # 
  # Effect ( Effefct_Impact | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
  try:
    snpEffects = []
    homopolymers = []
    polya_count = 0
    polya_list = []
    fh = open( options.vcfFile )
    while True:
      line = fh.readline()
      if not line:
        break #EOF
      if line:
        if outputFile:
          print >> outputFile, "%s\n" % line
        line = line.strip()
        if len(line) < 1:
          continue
        elif line.startswith('#'):
          # check for SnpEffVersion
          """
          ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
          ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
          """
          if line.startswith('##SnpEffVersion='):
            m = re.match('##SnpEffVersion="(SnpEff )*([1-9])+.*$',line)
            if m:
              SnpEffVersion = m.group(2)
          # check for SnpEff Info Description
          """
          ##SnpEffVersion="2.1a (build 2012-04-20), by Pablo Cingolani"
          ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
          ##SnpEffVersion="SnpEff 3.0a (build 2012-07-08), by Pablo Cingolani"
          ##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' ">
          """
          pass
        else:
          if debug: print >> sys.stdout, "%s\n" % line
          freq = None
          depth = None
          fields = line.split('\t')
          (chrom,pos,id,ref,alts,qual,filter,info) = fields[0:8]
          if debug: print  >> sys.stdout, "%s:%s-%s" % (chrom,int(pos)-10,int(pos)+10) 
          if options.debug: print(fields)
          for idx,alt in enumerate(alts.split(',')):
            if options.debug: print >> sys.stdout, "alt: %s from: %s" % (alt,alts)
            if not re.match('^[ATCG]*$',alt):
              print >> sys.stdout, "only simple variant currently supported, ignoring: %s:%s  %s\n" % (chrom,pos,alt)
            for info_item in info.split(';'):
              try:
                if info_item.find('=') < 0:
                  continue
                (key,val) = info_item.split('=',1)  
                if key == 'SAF':   # Usually a SAF for each variant: if  A,AG   then SAF=0.333333333333333,SAF=0.333333333333333;
                  freqs = info_item.split(',')
                  (k,v) = freqs[idx].split('=',1)
                  freq = v
                elif key == 'DP':
                  depth = val
                elif key == 'EFF':
                  transcript_ids = []
                  effects = val.split(',')
                  ccds_dict = None
                  for effect in effects:
                    (eff,effs) = effect.rstrip(')').split('(')
                    eff_fields = effs.split('|')
                    if SnpEffVersion == '2':
                      """
                      Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
                      EFF=FRAME_SHIFT(HIGH||||CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
                      """
                      (impact,functional_class,codon_change,aa_change,gene_name,biotype,coding,transcript,exon) = eff_fields[0:9]
                    else: # SnpEffVersion 3  # adds Amino_Acid_length field
                      """
                      Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )
                      FRAME_SHIFT(HIGH||||745|CHUK|protein_coding|CODING|ENST00000370397|exon_10_101964263_101964414)
                      """
                      (impact,functional_class,codon_change,aa_change,aa_len,gene_name,biotype,coding,transcript,exon) = eff_fields[0:10]
                    if transcript:
                      transcript_ids.append(transcript)
                  if debug: print  >> sys.stdout, "transcripts: %s" % (transcript_ids) 
                  if options.with_ccds:
                    ccds_dict = getCcdsIDs(ensembl_host,options.ensembl_dataset,transcript_ids)
                    if debug: print  >> sys.stdout, "ccds_dict: %s" % (ccds_dict) 
                  for effect in effects:
                    snpEffect = SnpEffect(chrom,pos,ref,alt,freq,depth)
                    snpEffect.transcript_ids = transcript_ids
                    snpEffect.parseEffect(effect,snpEffVersion=SnpEffVersion)
                    if options.snpeff_aa_change:
                      snpEffect.use_eff_aa_change = True
                    if effects_list and not snpEffect.effect in effects_list:
                      continue
                    if snpEffect.transcript and (not options.with_ccds or snpEffect.transcript in ccds_dict):
                      if snpEffect.fetchEnsemblInfo(ensembl_host,options.ensembl_dataset,options.with_ccds):
                        if snpEffect.cds_pos:
                          if snpEffect.inHomopolymer(options.polyA_limit):
                            homopolymers.append(snpEffect)
                          else:
                            snpEffects.append(snpEffect)
                    if outputFile:
                      print >> outputFile, "%s" % '\t'.join(snpEffect.getReportFields())
                      printSeqText(outputFile,snpEffect)
                    if snpEffect.cds and snpEffect.cds_pos and not options.all_effects:
                      ## Just report the first sequence returned for a vcf line
                      break
              except Exception, e:
                sys.stderr.write( "line: %s err: %s\n" % (line,e) )
    print >> sys.stdout, "homopolymer changes not reported: %d" % len(homopolymers)
    # Sort snpEffects
    snpEffects.sort(cmp=lambda x,y: cmp(x.score(), y.score()),reverse=True)
    if outputHtml:
      printReportHtml(outputHtml, detailDir, snpEffects)
    if outputTsv:
      printReportTsv(outputTsv,snpEffects)
  except Exception, e:
    print(str(e))         

if __name__ == "__main__": __main__()