# HG changeset patch
# User erasmus-medical-center
# Date 1527681021 14400
# Node ID 10a407fb5072b1c50fe920ee1140fe17aa3ddf9d
# planemo upload for repository https://github.com/ErasmusMC-Bioinformatics/galaxytools-emc/tree/master/tools/hla_dq commit d6273a8247a1cbb7df2b26b9e97cd1bd3faa4f61
# diff -r 000000000000 -r 10a407fb5072 hla_dq.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/hla_dq.py Wed May 30 07:50:21 2018 -0400
# @@ -0,0 +1,86 @@
#!/usr/bin/env python
'''
Given HLA-DQA1 (A) and HLA-DQB1 (B) genes annotated with BLAST IMGT/HLA
database genotypes, determine associated serotypes


DQA1 | DQB1 | type
--------------------
02:01| 02:02| DQ2.2
03:03| 02:02| DQ2.3
05:01| 02:01| DQ2.5
03:01| 03:02| DQ8
03:02| 03:02| DQ8
03:03| 03:02| DQ8

Annotations are of form DQ[A|B]1*xx:yy[:zz[:vv]] where xx:yy is of interest.

Example: "HLA:HLA11066 DQA1*01:05:02 768 bp"
'''

import argparse
import itertools


# Each known combination of (DQA1 allele, DQB1 allele, associated type).
# Kept at module level so the table is built once instead of on every call.
ASSOCIATED_COMBINATIONS = (
    ('02:01', '02:02', 'DQ2.2'),
    ('03:03', '02:02', 'DQ2.3'),
    ('05:01', '02:01', 'DQ2.5'),
    ('03:01', '03:02', 'DQ8'),
    ('03:02', '03:02', 'DQ8'),
    ('03:03', '03:02', 'DQ8'),
)


def to_matrix(l, n):
    ''' Split the flat list l into consecutive rows of n items.

        The last row is shorter when len(l) is not a multiple of n.
    '''
    return [l[i:i+n] for i in range(0, len(l), n)]


def get_list_of_associated_types(filename, column):
    ''' Read a tab-separated BLAST hit table and return the xx:yy part
        of the annotation found in the given (1-based) column of every
        data row.

        The first line of the file is treated as a header and skipped,
        blank lines are ignored. A row whose annotation does not look like
        "<id> <gene>*xx:yy[...] ..." raises IndexError.
    '''
    types = []
    with open(filename) as f:
        # islice(f, 1, None) skips the header without loading the whole file
        for line in itertools.islice(f, 1, None):
            if not line.strip():
                # tolerate empty lines (e.g. a trailing newline at EOF)
                continue
            # strip the line ending first so it cannot end up in the allele
            # when the annotation is the last token on the line
            annotation = line.rstrip('\r\n').split('\t')[column - 1]
            allele = annotation.split(' ')[1].split('*')[1]
            types.append(':'.join(allele.split(':')[0:2]))
    return types


def get_associations(typesA, typesB):
    ''' Given list of genotype annotations (e.g. 02:01, the xx:yy part of
        DQA1*02:01..) from A and B, determine possible associated serotypes.

        Returns one entry per matching row of ASSOCIATED_COMBINATIONS, in
        table order, so a type may be listed more than once.
    '''
    # build the lookup sets once; membership is then O(1) per table row
    a_types = set(typesA)
    b_types = set(typesB)
    return [dq_type for dqa1, dqb1, dq_type in ASSOCIATED_COMBINATIONS
            if dqa1 in a_types and dqb1 in b_types]


def format_results(associations, n_b):
    ''' Render the A x B matrix of associated types as a list of TSV lines.

        associations is a list of rows (one per A file), each row holding one
        list of types per B file; n_b is the number of B columns. Types in a
        cell are de-duplicated, sorted and joined with ';', an empty cell is
        written as '-'.
    '''
    header_cells = ['B' + str(i) for i in range(1, n_b + 1)]
    lines = ['\t' + '\t'.join(header_cells) + '\n']
    for row_number, row in enumerate(associations, 1):
        cells = [';'.join(sorted(set(cell))) if cell else '-' for cell in row]
        lines.append('A' + str(row_number) + '\t' + '\t'.join(cells) + '\n')
    return lines


def main():
    ''' Parse the command line, compare every A file with every B file and
        write the resulting table to results.tsv in the working directory.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-A', required=True, action='append',
                        help='BLAST hits A gene')
    parser.add_argument('-B', required=True, action='append',
                        help='BLAST hits B gene')
    parser.add_argument('-c', '--column', default=5, type=int,
                        help='Column number containing the BLAST annotation')
    args = parser.parse_args()

    # TODO: QC check that file A contains DQA1 annotations and
    #       B file contains DQB1 annotations?

    # find possible associated types, for all combinations of alleles
    types_a = [get_list_of_associated_types(a, args.column) for a in args.A]
    types_b = [get_list_of_associated_types(b, args.column) for b in args.B]
    # product() varies B fastest, so every run of len(args.B) results
    # forms the row belonging to one A file
    associations = [get_associations(a, b)
                    for a, b in itertools.product(types_a, types_b)]
    associations = to_matrix(associations, len(args.B))

    # write output table
    with open('results.tsv', 'w') as outfile:
        outfile.writelines(format_results(associations, len(args.B)))


if __name__ == "__main__":
    main()

# diff -r 000000000000 -r 10a407fb5072 hla_dq.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/hla_dq.xml Wed May 30 07:50:21 2018 -0400
# @@ -0,0 +1,94 @@
# + Determine possible associated types given BLAST IMGT/HLA annotation
# (the other added lines of this hunk carry no visible content in this copy)
# diff -r 000000000000 -r 10a407fb5072 (file name continues on the next line)
test-data/all_sideA3.blast.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/all_sideA3.blast.tsv Wed May 30 07:50:21 2018 -0400 @@ -0,0 +1,7 @@ +QuerySeq SubjectSeq Identity Length Info +bin1 HLA:HLA10211 100.000 271 HLA:HLA10211 DQA1*02:01 786 bp +bin1 HLA:HLA10210 100.000 271 HLA:HLA10210 DQA1*03:03 786 bp +bin1 HLA:HLA18123 100.000 271 HLA:HLA18123 DQA1*05:01 677 bp +bin1 HLA:HLA16190 100.000 271 HLA:HLA16190 DQA1*03:01 786 bp +bin1 HLA:HLA16160 100.000 271 HLA:HLA16160 DQA1*03:02 786 bp +bin1 HLA:HLA15585 100.000 271 HLA:HLA15585 DQA1*03:03 786 bp diff -r 000000000000 -r 10a407fb5072 test-data/all_sideB3.blast.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/all_sideB3.blast.tsv Wed May 30 07:50:21 2018 -0400 @@ -0,0 +1,4 @@ +QuerySeq SubjectSeq Identity Length Info +bin1 HLA:HLA10211 100.000 271 HLA:HLA10211 DQB1*02:02 786 bp +bin1 HLA:HLA10210 100.000 271 HLA:HLA10210 DQB1*02:01 786 bp +bin1 HLA:HLA18123 100.000 271 HLA:HLA18123 DQB1*03:02 677 bp diff -r 000000000000 -r 10a407fb5072 test-data/sideA1.blast.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sideA1.blast.tsv Wed May 30 07:50:21 2018 -0400 @@ -0,0 +1,16 @@ +QuerySeq SubjectSeq Identity Length Info +bin1 HLA:HLA16140 100.000 271 HLA:HLA16140 DQB1*02:83 786 bp +bin1 HLA:HLA13925 100.000 271 HLA:HLA13925 DQB1*02:63 677 bp +bin1 HLA:HLA12878 100.000 271 HLA:HLA12878 DQB1*02:53Q 783 bp +bin1 HLA:HLA00622 100.000 271 HLA:HLA00622 DQB1*02:01:01 786 bp +bin1 HLA:HLA14955 100.000 270 HLA:HLA14955 DQB1*02:72 552 bp +bin1 HLA:HLA13230 100.000 270 HLA:HLA13230 DQB1*02:57 552 bp +bin1 HLA:HLA09344 100.000 270 HLA:HLA09344 DQB1*02:27 552 bp +bin1 HLA:HLA09245 100.000 270 HLA:HLA09245 DQB1*02:14:01 552 bp +bin1 HLA:HLA08916 100.000 270 HLA:HLA08916 DQB1*02:08 552 bp +bin1 HLA:HLA08083 100.000 270 HLA:HLA08083 DQB1*02:07:01 552 bp +bin1 HLA:HLA15028 100.000 270 HLA:HLA15028 DQB1*02:01:24 552 bp +bin1 HLA:HLA08915 100.000 270 HLA:HLA08915 DQB1*02:01:07 552 bp +bin1 
HLA:HLA08777 100.000 270 HLA:HLA08777 DQB1*02:01:06 552 bp +bin1 HLA:HLA08310 100.000 270 HLA:HLA08310 DQB1*02:01:05 552 bp +bin1 HLA:HLA05923 100.000 270 HLA:HLA05923 DQB1*02:01:04 552 bp diff -r 000000000000 -r 10a407fb5072 test-data/sideA2.blast.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sideA2.blast.tsv Wed May 30 07:50:21 2018 -0400 @@ -0,0 +1,64 @@ +QuerySeq SubjectSeq Identity Length Info +bin2 HLA:HLA10211 100.000 271 HLA:HLA10211 DQB1*03:93 786 bp +bin2 HLA:HLA10210 100.000 271 HLA:HLA10210 DQB1*03:92 786 bp +bin2 HLA:HLA18123 100.000 271 HLA:HLA18123 DQB1*03:276N 677 bp +bin2 HLA:HLA16190 100.000 271 HLA:HLA16190 DQB1*03:254 786 bp +bin2 HLA:HLA16160 100.000 271 HLA:HLA16160 DQB1*03:252 786 bp +bin2 HLA:HLA15585 100.000 271 HLA:HLA15585 DQB1*03:243 786 bp +bin2 HLA:HLA15476 100.000 271 HLA:HLA15476 DQB1*03:231 663 bp +bin2 HLA:HLA13930 100.000 271 HLA:HLA13930 DQB1*03:197Q 780 bp +bin2 HLA:HLA02208 100.000 271 HLA:HLA02208 DQB1*03:16 677 bp +bin2 HLA:HLA01574 100.000 271 HLA:HLA01574 DQB1*03:12 786 bp +bin2 HLA:HLA17695 100.000 271 HLA:HLA17695 DQB1*03:10:02:02 786 bp +bin2 HLA:HLA10276 100.000 271 HLA:HLA10276 DQB1*03:10:02:01 786 bp +bin2 HLA:HLA01164 100.000 271 HLA:HLA01164 DQB1*03:10:01 690 bp +bin2 HLA:HLA00630 100.000 271 HLA:HLA00630 DQB1*03:04:01 786 bp +bin2 HLA:HLA16168 100.000 271 HLA:HLA16168 DQB1*03:01:37 786 bp +bin2 HLA:HLA10540 100.000 271 HLA:HLA10540 DQB1*03:01:22 786 bp +bin2 HLA:HLA10209 100.000 271 HLA:HLA10209 DQB1*03:01:21 786 bp +bin2 HLA:HLA09634 100.000 271 HLA:HLA09634 DQB1*03:01:17 786 bp +bin2 HLA:HLA09096 100.000 271 HLA:HLA09096 DQB1*03:01:08 786 bp +bin2 HLA:HLA02688 100.000 271 HLA:HLA02688 DQB1*03:01:03 786 bp +bin2 HLA:HLA18096 100.000 271 HLA:HLA18096 DQB1*03:01:01:20 786 bp +bin2 HLA:HLA17597 100.000 271 HLA:HLA17597 DQB1*03:01:01:19 786 bp +bin2 HLA:HLA17466 100.000 271 HLA:HLA17466 DQB1*03:01:01:18 786 bp +bin2 HLA:HLA17462 100.000 271 HLA:HLA17462 DQB1*03:01:01:17 786 bp +bin2 HLA:HLA17461 
100.000 271 HLA:HLA17461 DQB1*03:01:01:16 786 bp +bin2 HLA:HLA17460 100.000 271 HLA:HLA17460 DQB1*03:01:01:15 786 bp +bin2 HLA:HLA17369 100.000 271 HLA:HLA17369 DQB1*03:01:01:14 786 bp +bin2 HLA:HLA17367 100.000 271 HLA:HLA17367 DQB1*03:01:01:12 786 bp +bin2 HLA:HLA17366 100.000 271 HLA:HLA17366 DQB1*03:01:01:11 786 bp +bin2 HLA:HLA17365 100.000 271 HLA:HLA17365 DQB1*03:01:01:10 786 bp +bin2 HLA:HLA17335 100.000 271 HLA:HLA17335 DQB1*03:01:01:09 786 bp +bin2 HLA:HLA17197 100.000 271 HLA:HLA17197 DQB1*03:01:01:08 786 bp +bin2 HLA:HLA17167 100.000 271 HLA:HLA17167 DQB1*03:01:01:07 786 bp +bin2 HLA:HLA17162 100.000 271 HLA:HLA17162 DQB1*03:01:01:06 786 bp +bin2 HLA:HLA15507 100.000 271 HLA:HLA15507 DQB1*03:01:01:05 786 bp +bin2 HLA:HLA15506 100.000 271 HLA:HLA15506 DQB1*03:01:01:04 786 bp +bin2 HLA:HLA06616 100.000 271 HLA:HLA06616 DQB1*03:01:01:03 786 bp +bin2 HLA:HLA06613 100.000 271 HLA:HLA06613 DQB1*03:01:01:02 786 bp +bin2 HLA:HLA00625 100.000 271 HLA:HLA00625 DQB1*03:01:01:01 786 bp +bin2 HLA:HLA09837 100.000 270 HLA:HLA09837 DQB1*03:83 552 bp +bin2 HLA:HLA09654 100.000 270 HLA:HLA09654 DQB1*03:80 552 bp +bin2 HLA:HLA09566 100.000 270 HLA:HLA09566 DQB1*03:73 552 bp +bin2 HLA:HLA09095 100.000 270 HLA:HLA09095 DQB1*03:47 552 bp +bin2 HLA:HLA09093 100.000 270 HLA:HLA09093 DQB1*03:46 552 bp +bin2 HLA:HLA08289 100.000 270 HLA:HLA08289 DQB1*03:44 552 bp +bin2 HLA:HLA06177 100.000 270 HLA:HLA06177 DQB1*03:36 552 bp +bin2 HLA:HLA05767 100.000 270 HLA:HLA05767 DQB1*03:28 552 bp +bin2 HLA:HLA05374 100.000 270 HLA:HLA05374 DQB1*03:27 552 bp +bin2 HLA:HLA15599 100.000 270 HLA:HLA15599 DQB1*03:242 552 bp +bin2 HLA:HLA13500 100.000 270 HLA:HLA13500 DQB1*03:186 552 bp +bin2 HLA:HLA11586 100.000 270 HLA:HLA11586 DQB1*03:139 552 bp +bin2 HLA:HLA01589 100.000 270 HLA:HLA01589 DQB1*03:13 552 bp +bin2 HLA:HLA11128 100.000 270 HLA:HLA11128 DQB1*03:119 552 bp +bin2 HLA:HLA10928 100.000 270 HLA:HLA10928 DQB1*03:114 552 bp +bin2 HLA:HLA10663 100.000 270 HLA:HLA10663 DQB1*03:113 552 bp 
+bin2 HLA:HLA17148 100.000 270 HLA:HLA17148 DQB1*03:01:38 552 bp +bin2 HLA:HLA15778 100.000 270 HLA:HLA15778 DQB1*03:01:34 552 bp +bin2 HLA:HLA14778 100.000 270 HLA:HLA14778 DQB1*03:01:33 552 bp +bin2 HLA:HLA14637 100.000 270 HLA:HLA14637 DQB1*03:01:32 552 bp +bin2 HLA:HLA13622 100.000 270 HLA:HLA13622 DQB1*03:01:30 552 bp +bin2 HLA:HLA09772 100.000 270 HLA:HLA09772 DQB1*03:01:19 552 bp +bin2 HLA:HLA09094 100.000 270 HLA:HLA09094 DQB1*03:01:07 552 bp +bin2 HLA:HLA06142 100.000 270 HLA:HLA06142 DQB1*03:01:06 552 bp diff -r 000000000000 -r 10a407fb5072 test-data/sideB1.blast.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sideB1.blast.tsv Wed May 30 07:50:21 2018 -0400 @@ -0,0 +1,14 @@ +QuerySeq SubjectSeq Identity Length Info +bin1 HLA:HLA09875 100.000 249 HLA:HLA09875 DQA1*01:12 531 bp +bin1 HLA:HLA11066 100.000 249 HLA:HLA11066 DQA1*01:05:02 768 bp +bin1 HLA:HLA00606 100.000 249 HLA:HLA00606 DQA1*01:05:01 768 bp +bin1 HLA:HLA01376 100.000 249 HLA:HLA01376 DQA1*01:04:02 768 bp +bin1 HLA:HLA14793 100.000 249 HLA:HLA14793 DQA1*01:04:01:04 768 bp +bin1 HLA:HLA14792 100.000 249 HLA:HLA14792 DQA1*01:04:01:03 768 bp +bin1 HLA:HLA06597 100.000 249 HLA:HLA06597 DQA1*01:04:01:02 768 bp +bin1 HLA:HLA00605 100.000 249 HLA:HLA00605 DQA1*01:04:01:01 768 bp +bin1 HLA:HLA01409 100.000 249 HLA:HLA01409 DQA1*01:01:02 768 bp +bin1 HLA:HLA17305 100.000 249 HLA:HLA17305 DQA1*01:01:01:05 768 bp +bin1 HLA:HLA14787 100.000 249 HLA:HLA14787 DQA1*01:01:01:03 768 bp +bin1 HLA:HLA14786 100.000 249 HLA:HLA14786 DQA1*01:01:01:02 768 bp +bin1 HLA:HLA00601 100.000 249 HLA:HLA00601 DQA1*01:01:01:01 768 bp diff -r 000000000000 -r 10a407fb5072 test-data/sideB2.blast.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sideB2.blast.tsv Wed May 30 07:50:21 2018 -0400 @@ -0,0 +1,9 @@ +QuerySeq SubjectSeq Identity Length Info +bin2 HLA:HLA17290 100.000 249 HLA:HLA17290 DQA1*03:04 768 bp +bin2 HLA:HLA07419 100.000 249 HLA:HLA07419 DQA1*03:03:02 768 bp +bin2 HLA:HLA14797 
100.000 249 HLA:HLA14797 DQA1*03:03:01:03 768 bp +bin2 HLA:HLA14795 100.000 249 HLA:HLA14795 DQA1*03:03:01:02 768 bp +bin2 HLA:HLA00611 100.000 249 HLA:HLA00611 DQA1*03:03:01:01 768 bp +bin2 HLA:HLA17309 100.000 249 HLA:HLA17309 DQA1*03:02:01:02 768 bp +bin2 HLA:HLA00610 100.000 249 HLA:HLA00610 DQA1*03:02:01:01 768 bp +bin2 HLA:HLA00608 100.000 249 HLA:HLA00608 DQA1*03:01:01 768 bp diff -r 000000000000 -r 10a407fb5072 test-data/test1_results.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test1_results.tsv Wed May 30 07:50:21 2018 -0400 @@ -0,0 +1,3 @@ + B1 B2 +A1 - - +A2 - DQ8 diff -r 000000000000 -r 10a407fb5072 test-data/test2_results.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test2_results.tsv Wed May 30 07:50:21 2018 -0400 @@ -0,0 +1,4 @@ + B1 B2 B3 +A1 - - DQ2.2 +A2 - DQ8 DQ8 +A3 - DQ8 DQ2.2;DQ2.3;DQ2.5;DQ8