diff hla_dq.py @ 0:10a407fb5072 draft

planemo upload for repository https://github.com/ErasmusMC-Bioinformatics/galaxytools-emc/tree/master/tools/hla_dq commit d6273a8247a1cbb7df2b26b9e97cd1bd3faa4f61
author erasmus-medical-center
date Wed, 30 May 2018 07:50:21 -0400
parents
children 4fc47a3ff9e8
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hla_dq.py	Wed May 30 07:50:21 2018 -0400
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+'''
+Given HLA-A and HLA-B genes annotated with BLAST IMGT/HLA database
+genotypes, determine associated serotypes
+
+
+DQA1 | DQB1 | type
+--------------------
+02:01| 02:02| DQ2.2
+03:03| 02:02| DQ2.3
+05:01| 02:01| DQ2.3
+03:01| 03:02| DQ8
+03:02| 03:02| DQ8
+03:03| 03:02| DQ8
+
+Annotations are of form DQ[A|B]1*xx:yy[:zz[:vv]] where xx:yy is of interest.
+
+Example: "HLA:HLA11066 DQA1*01:05:02 768 bp"
+'''
+
+import argparse
+import itertools
+
+
+def to_matrix(l, n):
+    return [l[i:i+n] for i in range(0, len(l), n)]
+
+
+def get_list_of_associated_types(filename, column):
+    with open(filename) as f:
+        contents = f.readlines()
+
+    return [':'.join(line.split('\t')[column-1].split(' ')[1]
+            .split('*')[1].split(':')[0:2])
+            for line in contents[1:]]
+
+
+def get_associations(typesA, typesB):
+    ''' Given list of genotype annotations (e.g. DQA1*02:01..)
+        from A and B, determine possible associated serotypes
+    '''
+
+    ''' each combination of DQA1,DQB1,type '''
+    associated_combinations = [
+        ['02:01', '02:02', 'DQ2.2'],
+        ['03:03', '02:02', 'DQ2.3'],
+        ['05:01', '02:01', 'DQ2.5'],
+        ['03:01', '03:02', 'DQ8'],
+        ['03:02', '03:02', 'DQ8'],
+        ['03:03', '03:02', 'DQ8']]
+
+    return [a[2] for a in associated_combinations
+            if a[0] in typesA and a[1] in typesB]
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-A', required=True, action='append',
+                        help='BLAST hits A gene')
+    parser.add_argument('-B', required=True, action='append',
+                        help='BLAST hits B gene')
+    parser.add_argument('-c', '--column', default=5, type=int,
+                        help='Column number containing the BLAST annotation')
+    args = parser.parse_args()
+
+    # TODO: QC check that file A contains DQA1 annotations and
+    # B file contains DQB1 annotations?
+
+    # find possible associated types, for all combinations of alleles
+    typesA = [get_list_of_associated_types(A, args.column) for A in args.A]
+    typesB = [get_list_of_associated_types(B, args.column) for B in args.B]
+    associations = [get_associations(c[0], c[1])
+                    for c in itertools.product(typesA, typesB)]
+    associations = to_matrix(associations, len(args.B))
+
+    # write output table
+    header = '\t'+'\t'.join(['B'+str(i+1) for i in range(0, len(args.B))])+'\n'
+    bcount = 0
+    with open('results.tsv', 'w') as outfile:
+        outfile.write(header)
+        for line in associations:
+            bcount += 1
+            outfile.write(
+                'A' + str(bcount) + '\t' +
+                '\t'.join([';'.join(sorted(set(l)))
+                          if l else '-' for l in line])+'\n')