diff change_o/DefineClones.py @ 52:22dddabe3637 draft

Uploaded
author davidvanzessen
date Tue, 23 May 2017 08:32:58 -0400
parents c33d93683a09
children
line wrap: on
line diff
--- a/change_o/DefineClones.py	Tue May 16 08:52:48 2017 -0400
+++ b/change_o/DefineClones.py	Tue May 23 08:32:58 2017 -0400
@@ -10,6 +10,7 @@
 import os
 import re
 import sys
+import csv
 import numpy as np
 from argparse import ArgumentParser
 from collections import OrderedDict
@@ -25,56 +26,108 @@
 from presto.Multiprocessing import manageProcesses
 from presto.Sequence import getDNAScoreDict
 from changeo.Commandline import CommonHelpFormatter, getCommonArgParser, parseCommonArgs
-from changeo.Distance import getDNADistMatrix, getAADistMatrix, \
-                             hs1f_model, m1n_model, hs5f_model, \
-                             calcDistances, formClusters
+from changeo.Distance import distance_models, calcDistances, formClusters
 from changeo.IO import getDbWriter, readDbFile, countDbFile
 from changeo.Multiprocessing import DbData, DbResult
 
+## Set maximum field size for csv.reader
+csv.field_size_limit(sys.maxsize)
+
 # Defaults
 default_translate = False
 default_distance = 0.0
-default_bygroup_model = 'hs1f'
+default_index_mode = 'gene'
+default_index_action = 'set'
+default_bygroup_model = 'ham'
 default_hclust_model = 'chen2010'
 default_seq_field = 'JUNCTION'
 default_norm = 'len'
 default_sym = 'avg'
 default_linkage = 'single'
-
-# TODO:  should be in Distance, but need to be after function definitions
-# Amino acid Hamming distance
-aa_model = getAADistMatrix(mask_dist=1, gap_dist=0)
-
-# DNA Hamming distance
-ham_model = getDNADistMatrix(mask_dist=0, gap_dist=0)
+choices_bygroup_model = ('ham', 'aa', 'hh_s1f', 'hh_s5f', 'mk_rs1nf', 'mk_rs5nf', 'hs1f_compat', 'm1n_compat')
 
 
-# TODO:  this function is an abstraction to facilitate later cleanup
-def getModelMatrix(model):
+def indexByIdentity(index, key, rec, fields=None):
     """
-    Simple wrapper to get distance matrix from model name
+    Updates a preclone index with a simple key
+
+    Arguments:
+    index = preclone index from indexJunctions
+    key = index key
+    rec = IgRecord to add to the index
+    fields = additional annotation fields to use to group preclones;
+             if None use only V, J and junction length
+
+    Returns:
+    None. Updates index with new key and records.
+    """
+    index.setdefault(tuple(key), []).append(rec)
+
+
+def indexByUnion(index, key, rec, fields=None):
+    """
+    Updates a preclone index with the union of nested keys
 
     Arguments:
-    model = model name
+    index = preclone index from indexJunctions
+    key = index key
+    rec = IgRecord to add to the index
+    fields = additional annotation fields to use to group preclones;
+             if None use only V, J and junction length
 
-    Return:
-    a pandas.DataFrame containing the character distance matrix
+    Returns:
+    None. Updates index with new key and records.
     """
-    if model == 'aa':
-        return(aa_model)
-    elif model == 'ham':
-        return(ham_model)
-    elif model == 'm1n':
-        return(m1n_model)
-    elif model == 'hs1f':
-        return(hs1f_model)
-    elif model == 'hs5f':
-        return(hs5f_model)
+    # List of values for this/new key
+    val = [rec]
+    f_range = list(range(2, 3 + (len(fields) if fields else 0)))
+
+    # See if field/junction length combination exists in index
+    outer_dict = index
+    for field in f_range:
+        try:
+            outer_dict = outer_dict[key[field]]
+        except (KeyError):
+            outer_dict = None
+            break
+    # If field combination exists, look through Js
+    j_matches = []
+    if outer_dict is not None:
+        for j in outer_dict.keys():
+            if not set(key[1]).isdisjoint(set(j)):
+                key[1] = tuple(set(key[1]).union(set(j)))
+                j_matches += [j]
+    # If J overlap exists, look through Vs for each J
+    for j in j_matches:
+        v_matches = []
+        # Collect V matches for this J
+        for v in outer_dict[j].keys():
+            if not set(key[0]).isdisjoint(set(v)):
+                key[0] = tuple(set(key[0]).union(set(v)))
+                v_matches += [v]
+        # If there are V overlaps for this J, pop them out
+        if v_matches:
+            val += list(chain(*(outer_dict[j].pop(v) for v in v_matches)))
+            # If the J dict is now empty, remove it
+            if not outer_dict[j]:
+                outer_dict.pop(j, None)
+
+    # Add value(s) into index nested dictionary
+    # OMG Python pointers are the best!
+    # Add field dictionaries into index
+    outer_dict = index
+    for field in f_range:
+        outer_dict.setdefault(key[field], {})
+        outer_dict = outer_dict[key[field]]
+    # Add J, then V into index
+    if key[1] in outer_dict:
+        outer_dict[key[1]].update({key[0]: val})
     else:
-        sys.stderr.write('Unrecognized distance model: %s.\n' % model)
+        outer_dict[key[1]] = {key[0]: val}
 
 
-def indexJunctions(db_iter, fields=None, mode='gene', action='first'):
+def indexJunctions(db_iter, fields=None, mode=default_index_mode,
+                   action=default_index_action):
     """
     Identifies preclonal groups by V, J and junction length
 
@@ -90,27 +143,47 @@
     Returns: 
     a dictionary of {(V, J, junction length):[IgRecords]}
     """
+    # print(fields)
     # Define functions for grouping keys
     if mode == 'allele' and fields is None:
         def _get_key(rec, act):
-            return (rec.getVAllele(act), rec.getJAllele(act),
-                    None if rec.junction is None else len(rec.junction))
+            return [rec.getVAllele(act), rec.getJAllele(act),
+                    None if rec.junction is None else len(rec.junction)]
     elif mode == 'gene' and fields is None:
         def _get_key(rec, act):  
-            return (rec.getVGene(act), rec.getJGene(act),
-                    None if rec.junction is None else len(rec.junction))
+            return [rec.getVGene(act), rec.getJGene(act),
+                    None if rec.junction is None else len(rec.junction)]
     elif mode == 'allele' and fields is not None:
         def _get_key(rec, act):
             vdj = [rec.getVAllele(act), rec.getJAllele(act),
                     None if rec.junction is None else len(rec.junction)]
             ann = [rec.toDict().get(k, None) for k in fields]
-            return tuple(chain(vdj, ann))
+            return list(chain(vdj, ann))
     elif mode == 'gene' and fields is not None:
         def _get_key(rec, act):
             vdj = [rec.getVGene(act), rec.getJGene(act),
                     None if rec.junction is None else len(rec.junction)]
             ann = [rec.toDict().get(k, None) for k in fields]
-            return tuple(chain(vdj, ann))
+            return list(chain(vdj, ann))
+
+    # Function to flatten nested dictionary
+    def _flatten_dict(d, parent_key=''):
+        items = []
+        for k, v in d.items():
+            new_key = parent_key + [k] if parent_key else [k]
+            if isinstance(v, dict):
+                items.extend(_flatten_dict(v, new_key).items())
+            else:
+                items.append((new_key, v))
+        flat_dict = {None if None in i[0] else tuple(i[0]): i[1] for i in items}
+        return flat_dict
+
+    if action == 'first':
+        index_func = indexByIdentity
+    elif action == 'set':
+        index_func = indexByUnion
+    else:
+        sys.stderr.write('Unrecognized action: %s.\n' % action)
 
     start_time = time()
     clone_index = {}
@@ -127,35 +200,16 @@
 
         # Assigned passed preclone records to key and failed to index None
         if all([k is not None and k != '' for k in key]):
-            #print key
-            # TODO:  Has much slow. Should have less slow.
-            if action == 'set':
-                
-                f_range = list(range(2, 3 + (len(fields) if fields else 0)))
-                vdj_range = list(range(2))
-                
-                # Check for any keys that have matching columns and junction length and overlapping genes/alleles
-                to_remove = []
-                if len(clone_index) > (1 if None in clone_index else 0) and key not in clone_index:
-                    key = list(key)
-                    for k in clone_index:
-                        if k is not None and all([key[i] == k[i] for i in f_range]):
-                            if all([not set(key[i]).isdisjoint(set(k[i])) for i in vdj_range]):
-                                for i in vdj_range:  key[i] = tuple(set(key[i]).union(set(k[i])))
-                                to_remove.append(k)
-                
-                # Remove original keys, replace with union of all genes/alleles and append values to new key
-                val = [rec]
-                val += list(chain(*(clone_index.pop(k) for k in to_remove)))
-                clone_index[tuple(key)] = clone_index.get(tuple(key),[]) + val 
-
-            elif action == 'first':
-                clone_index.setdefault(key, []).append(rec)
+            # Update index dictionary
+            index_func(clone_index, key, rec, fields)
         else:
             clone_index.setdefault(None, []).append(rec)
 
     printProgress(rec_count, step=1000, start_time=start_time, end=True)
 
+    if action == 'set':
+        clone_index = _flatten_dict(clone_index)
+
     return clone_index
 
 
@@ -179,15 +233,20 @@
     a dictionary of lists defining {clone number: [IgRecords clonal group]}
     """
     # Get distance matrix if not provided
-    if dist_mat is None:  dist_mat = getModelMatrix(model)
+    if dist_mat is None:
+        try:
+            dist_mat = distance_models[model]
+        except KeyError:
+            sys.exit('Unrecognized distance model: %s' % args_dict['model'])
 
+    # TODO:  can be cleaned up with abstract model class
     # Determine length of n-mers
-    if model in ['hs1f', 'm1n', 'aa', 'ham']:
+    if model in ['hs1f_compat', 'm1n_compat', 'aa', 'ham', 'hh_s1f', 'mk_rs1nf']:
         nmer_len = 1
-    elif model in ['hs5f']:
+    elif model in ['hh_s5f', 'mk_rs5nf']:
         nmer_len = 5
     else:
-        sys.stderr.write('Unrecognized distance model: %s.\n' % model)
+        sys.exit('Unrecognized distance model: %s.\n' % model)
 
     # Define unique junction mapping
     seq_map = {}
@@ -197,7 +256,7 @@
         if len(seq) == 0:
             return None
 
-        seq = re.sub('[\.-]','N', str(seq))
+        seq = re.sub('[\.-]', 'N', str(seq))
         if model == 'aa':  seq = translate(seq)
 
         seq_map.setdefault(seq, []).append(ig)
@@ -210,7 +269,7 @@
     seqs = list(seq_map.keys())
 
     # Calculate pairwise distance matrix
-    dists = calcDistances(seqs, nmer_len, dist_mat, norm, sym)
+    dists = calcDistances(seqs, nmer_len, dist_mat, sym=sym, norm=norm)
 
     # Perform hierarchical clustering
     clusters = formClusters(dists, linkage, distance)
@@ -449,6 +508,7 @@
 
             # Define result object for iteration and get data records
             records = data.data
+            # print(data.id)
             result = DbResult(data.id, records)
 
             # Check for invalid data (due to failed indexing) and add failed result
@@ -901,7 +961,7 @@
                      database with records failing clonal grouping.
 
              required fields:
-                 SEQUENCE_ID, V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL, JUNCTION_LENGTH
+                 SEQUENCE_ID, V_CALL or V_CALL_GENOTYPED, D_CALL, J_CALL, JUNCTION
 
                  <field>
                      sequence field specified by the --sf parameter
@@ -926,32 +986,36 @@
     
     # Distance cloning method
     parser_bygroup = subparsers.add_parser('bygroup', parents=[parser_parent],
-                                        formatter_class=CommonHelpFormatter,
-                                        help='''Defines clones as having same V assignment,
-                                              J assignment, and junction length with
-                                              specified substitution distance model.''')
+                                           formatter_class=CommonHelpFormatter,
+                                           help='''Defines clones as having same V assignment,
+                                                J assignment, and junction length with
+                                                specified substitution distance model.''',
+                                           description='''Defines clones as having same V assignment,
+                                                       J assignment, and junction length with
+                                                       specified substitution distance model.''')
     parser_bygroup.add_argument('-f', nargs='+', action='store', dest='fields', default=None,
                              help='Additional fields to use for grouping clones (non VDJ)')
     parser_bygroup.add_argument('--mode', action='store', dest='mode', 
-                             choices=('allele', 'gene'), default='gene', 
+                             choices=('allele', 'gene'), default=default_index_mode,
                              help='''Specifies whether to use the V(D)J allele or gene for
                                   initial grouping.''')
-    parser_bygroup.add_argument('--act', action='store', dest='action', default='set',
-                             choices=('first', 'set'),
+    parser_bygroup.add_argument('--act', action='store', dest='action',
+                             choices=('first', 'set'), default=default_index_action,
                              help='''Specifies how to handle multiple V(D)J assignments
                                   for initial grouping.''')
     parser_bygroup.add_argument('--model', action='store', dest='model', 
-                             choices=('aa', 'ham', 'm1n', 'hs1f', 'hs5f'),
+                             choices=choices_bygroup_model,
                              default=default_bygroup_model,
-                             help='''Specifies which substitution model to use for
-                                  calculating distance between sequences. Where m1n is the
-                                  mouse single nucleotide transition/trasversion model
-                                  of Smith et al, 1996; hs1f is the human single
-                                  nucleotide model derived from Yaari et al, 2013; hs5f
-                                  is the human S5F model of Yaari et al, 2013; ham is
-                                  nucleotide Hamming distance; and aa is amino acid
-                                  Hamming distance. The hs5f data should be
-                                  considered experimental.''')
+                             help='''Specifies which substitution model to use for calculating distance
+                                  between sequences. The "ham" model is nucleotide Hamming distance and
+                                  "aa" is amino acid Hamming distance. The "hh_s1f" and "hh_s5f" models are
+                                  human specific single nucleotide and 5-mer content models, respectively,
+                                  from Yaari et al, 2013. The "mk_rs1nf" and "mk_rs5nf" models are
+                                  mouse specific single nucleotide and 5-mer content models, respectively,
+                                  from Cui et al, 2016. The "m1n_compat" and "hs1f_compat" models are
+                                  deprecated models provided backwards compatibility with the "m1n" and
+                                  "hs1f" models in Change-O v0.3.3 and SHazaM v0.1.4. Both
+                                  5-mer models should be considered experimental.''')
     parser_bygroup.add_argument('--dist', action='store', dest='distance', type=float, 
                              default=default_distance,
                              help='The distance threshold for clonal grouping')
@@ -977,22 +1041,25 @@
     parser_bygroup.set_defaults(group_func=indexJunctions)  
     parser_bygroup.set_defaults(clone_func=distanceClones)
     
-    
-    # Hierarchical clustering cloning method
-    parser_hclust = subparsers.add_parser('hclust', parents=[parser_parent],
+    # Chen2010
+    parser_chen = subparsers.add_parser('chen2010', parents=[parser_parent],
                                         formatter_class=CommonHelpFormatter,
-                                        help='Defines clones by specified distance metric on CDR3s and \
-                                              cutting of hierarchical clustering tree')
-#     parser_hclust.add_argument('-f', nargs='+', action='store', dest='fields', default=None,
-#                              help='Fields to use for grouping clones (non VDJ)')
-    parser_hclust.add_argument('--method', action='store', dest='method', 
-                             choices=('chen2010', 'ademokun2011'), default=default_hclust_model, 
-                             help='Specifies which cloning method to use for calculating distance \
-                                   between CDR3s, computing linkage, and cutting clusters')
-    parser_hclust.set_defaults(feed_func=feedQueueClust)
-    parser_hclust.set_defaults(work_func=processQueueClust)
-    parser_hclust.set_defaults(collect_func=collectQueueClust)
-    parser_hclust.set_defaults(cluster_func=hierClust)
+                                        help='''Defines clones by method specified in Chen, 2010.''',
+                                        description='''Defines clones by method specified in Chen, 2010.''')
+    parser_chen.set_defaults(feed_func=feedQueueClust)
+    parser_chen.set_defaults(work_func=processQueueClust)
+    parser_chen.set_defaults(collect_func=collectQueueClust)
+    parser_chen.set_defaults(cluster_func=hierClust)
+
+    # Ademokun2011
+    parser_ade = subparsers.add_parser('ademokun2011', parents=[parser_parent],
+                                        formatter_class=CommonHelpFormatter,
+                                        help='''Defines clones by method specified in Ademokun, 2011.''',
+                                        description='''Defines clones by method specified in Ademokun, 2011.''')
+    parser_ade.set_defaults(feed_func=feedQueueClust)
+    parser_ade.set_defaults(work_func=processQueueClust)
+    parser_ade.set_defaults(collect_func=collectQueueClust)
+    parser_ade.set_defaults(cluster_func=hierClust)
         
     return parser
 
@@ -1023,8 +1090,11 @@
                                    'linkage': args_dict['linkage'],
                                    'seq_field': args_dict['seq_field']}
 
-        # TODO:  can be cleaned up with abstract model class
-        args_dict['clone_args']['dist_mat'] = getModelMatrix(args_dict['model'])
+        # Get distance matrix
+        try:
+            args_dict['clone_args']['dist_mat'] = distance_models[args_dict['model']]
+        except KeyError:
+            sys.exit('Unrecognized distance model: %s' % args_dict['model'])
 
         del args_dict['fields']
         del args_dict['action']
@@ -1037,16 +1107,17 @@
         del args_dict['seq_field']
 
     # Define clone_args
-    if args.command == 'hclust':
-        dist_funcs = {'chen2010':distChen2010, 'ademokun2011':distAdemokun2011}
-        args_dict['clone_func'] = dist_funcs[args_dict['method']]
-        args_dict['cluster_args'] = {'method':  args_dict['method']}
-        #del args_dict['fields']
-        del args_dict['method']
+    if args.command == 'chen2010':
+        args_dict['clone_func'] = distChen2010
+        args_dict['cluster_args'] = {'method': args.command }
+
+    if args.command == 'ademokun2011':
+        args_dict['clone_func'] = distAdemokun2011
+        args_dict['cluster_args'] = {'method': args.command }
     
     # Call defineClones
     del args_dict['command']
     del args_dict['db_files']
     for f in args.__dict__['db_files']:
         args_dict['db_file'] = f
-        defineClones(**args_dict)
\ No newline at end of file
+        defineClones(**args_dict)