Mercurial > repos > erasmus-medical-center > hla_dq
changeset 0:10a407fb5072 draft
planemo upload for repository https://github.com/ErasmusMC-Bioinformatics/galaxytools-emc/tree/master/tools/hla_dq commit d6273a8247a1cbb7df2b26b9e97cd1bd3faa4f61
author | erasmus-medical-center |
---|---|
date | Wed, 30 May 2018 07:50:21 -0400 |
parents | |
children | 4fc47a3ff9e8 |
files | hla_dq.py hla_dq.xml test-data/all_sideA3.blast.tsv test-data/all_sideB3.blast.tsv test-data/sideA1.blast.tsv test-data/sideA2.blast.tsv test-data/sideB1.blast.tsv test-data/sideB2.blast.tsv test-data/test1_results.tsv test-data/test2_results.tsv |
diffstat | 10 files changed, 301 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# hla_dq.py
'''
Given BLAST hits against the IMGT/HLA database for the HLA-DQ A gene (DQA1)
and B gene (DQB1), determine the associated DQ types.

DQA1 | DQB1 | type
--------------------
02:01| 02:02| DQ2.2
03:03| 02:02| DQ2.3
05:01| 02:01| DQ2.5
03:01| 03:02| DQ8
03:02| 03:02| DQ8
03:03| 03:02| DQ8

Annotations are of form DQ[A|B]1*xx:yy[:zz[:vv]] where xx:yy is of interest.

Example: "HLA:HLA11066 DQA1*01:05:02 768 bp"

Output is written to results.tsv in the current working directory: one row
per -A file (A1, A2, ...), one column per -B file (B1, B2, ...), each cell
holding the ';'-joined associated types, or '-' when there are none.
'''

import argparse
import itertools


# Each entry is (DQA1 xx:yy, DQB1 xx:yy, associated type).
ASSOCIATED_COMBINATIONS = (
    ('02:01', '02:02', 'DQ2.2'),
    ('03:03', '02:02', 'DQ2.3'),
    ('05:01', '02:01', 'DQ2.5'),
    ('03:01', '03:02', 'DQ8'),
    ('03:02', '03:02', 'DQ8'),
    ('03:03', '03:02', 'DQ8'),
)


def to_matrix(l, n):
    ''' Split sequence l into consecutive rows of n items each.

    The last row is shorter when len(l) is not a multiple of n;
    an empty l gives an empty list.
    '''
    return [l[i:i+n] for i in range(0, len(l), n)]


def get_list_of_associated_types(filename, column):
    ''' Read the xx:yy allele fields from a tab-separated BLAST hits file.

    The first line is treated as a header and skipped. For every other
    non-blank line, the given column (1-based) is expected to look like
    "HLA:HLA11066 DQA1*01:05:02 768 bp"; the second space-separated token
    is taken, and the first two ':'-separated fields after the '*' are
    returned (here "01:05").

    The gene name before the '*' is NOT checked.

    Returns a list with one "xx:yy" string per data line, in file order.
    Raises ValueError (naming the file and line number) when a data line
    does not contain such an annotation in the requested column.
    '''
    types = []
    with open(filename) as f:
        next(f, None)  # skip the header line
        for lineno, line in enumerate(f, start=2):
            # Drop the line terminator so it cannot end up inside the
            # allele when the annotation is the last thing on the line.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue  # tolerate blank lines (e.g. at the end of file)
            try:
                annotation = line.split('\t')[column-1]
                allele = annotation.split(' ')[1].split('*')[1]
            except IndexError:
                raise ValueError(
                    '{0}, line {1}: no annotation of form '
                    '"<id> <gene>*xx:yy ..." in column {2}'.format(
                        filename, lineno, column))
            types.append(':'.join(allele.split(':')[0:2]))
    return types


def get_associations(typesA, typesB):
    ''' Given lists of xx:yy genotype fields (e.g. 02:01 taken from
    DQA1*02:01..) from A and B, determine possible associated types.

    Returns one type name per row of ASSOCIATED_COMBINATIONS whose DQA1
    value is in typesA and whose DQB1 value is in typesB, in table order.
    The same name can therefore occur more than once (e.g. DQ8).
    '''
    return [dq_type for dqa1, dqb1, dq_type in ASSOCIATED_COMBINATIONS
            if dqa1 in typesA and dqb1 in typesB]


def format_results_table(associations, n_columns):
    ''' Render the results table as tab-separated text.

    associations is the flat list of get_associations() results for every
    (A file, B file) pair, with the B file varying fastest; n_columns is
    the number of B files. Each cell holds the sorted, de-duplicated type
    names joined by ';', or '-' when the pair has no associated type.
    '''
    header = '\t' + '\t'.join(
        'B' + str(i) for i in range(1, n_columns + 1))
    lines = [header]
    rows = to_matrix(associations, n_columns)
    for row_number, row in enumerate(rows, start=1):
        cells = [';'.join(sorted(set(cell))) if cell else '-'
                 for cell in row]
        lines.append('A' + str(row_number) + '\t' + '\t'.join(cells))
    return '\n'.join(lines) + '\n'


def main(argv=None):
    ''' Parse the command line and write results.tsv.

    argv defaults to the process arguments (sys.argv[1:]).
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-A', required=True, action='append',
                        help='BLAST hits A gene')
    parser.add_argument('-B', required=True, action='append',
                        help='BLAST hits B gene')
    parser.add_argument('-c', '--column', default=5, type=int,
                        help='Column number containing the BLAST annotation')
    args = parser.parse_args(argv)

    # TODO: QC check that file A contains DQA1 annotations and
    # B file contains DQB1 annotations?

    # find possible associated types, for all combinations of alleles
    typesA = [get_list_of_associated_types(A, args.column) for A in args.A]
    typesB = [get_list_of_associated_types(B, args.column) for B in args.B]
    associations = [get_associations(a, b)
                    for a, b in itertools.product(typesA, typesB)]

    # write output table
    with open('results.tsv', 'w') as outfile:
        outfile.write(format_results_table(associations, len(args.B)))


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hla_dq.xml Wed May 30 07:50:21 2018 -0400 @@ -0,0 +1,94 @@ +<tool id="hla_dq" name="HLA-DQ typing" version="1.0"> + <description> Determine possible associated types given BLAST IMGT/HLA annotation</description> + <command detect_errors="exit_code"><![CDATA[ +python '$__tool_directory__/hla_dq.py' +#for $i in $filesA: + -A '$i.A' +#end for +#for $i in $filesB: + -B '$i.B' +#end for +#if $column and $column is not None: + --column $column +#end if + ]]></command> + <inputs> + <!-- use repeats cuz order matters --> + <repeat name="filesA" title="BLAST results A gene sequences" min="1"> + <param argument="-A" label="BLAST IMGT/HLA hits A gene" type="data" format="tabular"/> + </repeat> + <repeat name="filesB" title="BLAST results B gene sequences" min="1"> + <param argument="-B" label="BLAST IMGT/HLA hits B gene" type="data" format="tabular"/> + </repeat> + <param argument="--column" label="Column number containing the BLAST IMGT/HLA genotype annotation" type="integer" value="5" min="1" help="Example annotation: HLA:HLA11066 DQA1*01:05:02 768 bp"/> + </inputs> + <outputs> + <data name="serotype_table" format="tabular" from_work_dir="results.tsv" label="${tool.name} on ${on_string}: Serotype table"/> + </outputs> + <tests> + <test><!-- test with real data --> + <repeat name="filesA"> + <param name="A" value="sideA1.blast.tsv"/> + </repeat> + <repeat name="filesA"> + <param name="A" value="sideA2.blast.tsv"/> + </repeat> + <repeat name="filesB"> + <param name="B" value="sideB1.blast.tsv"/> + </repeat> + <repeat name="filesB"> + <param name="B" value="sideB2.blast.tsv"/> + </repeat> + <output name="serotype_table" file="test1_results.tsv"/> + </test> + <test><!-- test three files each and data leading to serotypes --> + <repeat name="filesA"> + <param name="A" value="sideA1.blast.tsv"/> + </repeat> + <repeat name="filesA"> + <param name="A" value="sideA2.blast.tsv"/> + </repeat> + <repeat name="filesA"> + <param name="A" 
value="all_sideA3.blast.tsv"/> + </repeat> + <repeat name="filesB"> + <param name="B" value="sideB1.blast.tsv"/> + </repeat> + <repeat name="filesB"> + <param name="B" value="sideB2.blast.tsv"/> + </repeat> + <repeat name="filesB"> + <param name="B" value="all_sideB3.blast.tsv"/> + </repeat> + <output name="serotype_table" file="test2_results.tsv"/> + </test> + </tests> + + <help><![CDATA[ + +Given BLAST hits against the IMGT/HLA database for the HLA-DQ A gene (DQA1) and B gene (DQB1) (one or more files each), determine possible associated serotypes. + +===== ===== ===== +DQA1 DQB1 type +===== ===== ===== +02:01 02:02 DQ2.2 +03:03 02:02 DQ2.3 +05:01 02:01 DQ2.5 +03:01 03:02 DQ8 +03:02 03:02 DQ8 +03:03 03:02 DQ8 +===== ===== ===== + +Example result table: + +==== ==== ========= +type B1 B2 +==== ==== ========= +A1 DQ8 \- +A2 \- DQ2.2;DQ8 +==== ==== ========= + +Where the rows A1, A2 correspond to two annotated input files given for gene A and the columns B1, B2 to two annotated input files given for gene B, each file representing a possible allele sequence. A cell lists the types associated with that pair of files, separated by ";"; "-" means no associated type was found. + + ]]></help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/all_sideA3.blast.tsv Wed May 30 07:50:21 2018 -0400 @@ -0,0 +1,7 @@ +QuerySeq SubjectSeq Identity Length Info +bin1 HLA:HLA10211 100.000 271 HLA:HLA10211 DQA1*02:01 786 bp +bin1 HLA:HLA10210 100.000 271 HLA:HLA10210 DQA1*03:03 786 bp +bin1 HLA:HLA18123 100.000 271 HLA:HLA18123 DQA1*05:01 677 bp +bin1 HLA:HLA16190 100.000 271 HLA:HLA16190 DQA1*03:01 786 bp +bin1 HLA:HLA16160 100.000 271 HLA:HLA16160 DQA1*03:02 786 bp +bin1 HLA:HLA15585 100.000 271 HLA:HLA15585 DQA1*03:03 786 bp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/all_sideB3.blast.tsv Wed May 30 07:50:21 2018 -0400 @@ -0,0 +1,4 @@ +QuerySeq SubjectSeq Identity Length Info +bin1 HLA:HLA10211 100.000 271 HLA:HLA10211 DQB1*02:02 786 bp +bin1 HLA:HLA10210 100.000 271 HLA:HLA10210 DQB1*02:01 786 bp +bin1 HLA:HLA18123 100.000 271 HLA:HLA18123 DQB1*03:02 677 bp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sideA1.blast.tsv Wed May 30 07:50:21 2018 -0400 @@ -0,0 +1,16 @@ +QuerySeq SubjectSeq Identity Length Info +bin1 HLA:HLA16140 100.000 271 HLA:HLA16140 DQB1*02:83 786 bp +bin1 HLA:HLA13925 100.000 271 HLA:HLA13925 DQB1*02:63 677 bp +bin1 HLA:HLA12878 100.000 271 HLA:HLA12878 DQB1*02:53Q 783 bp +bin1 HLA:HLA00622 100.000 271 HLA:HLA00622 DQB1*02:01:01 786 bp +bin1 HLA:HLA14955 100.000 270 HLA:HLA14955 DQB1*02:72 552 bp +bin1 HLA:HLA13230 100.000 270 HLA:HLA13230 DQB1*02:57 552 bp +bin1 HLA:HLA09344 100.000 270 HLA:HLA09344 DQB1*02:27 552 bp +bin1 HLA:HLA09245 100.000 270 HLA:HLA09245 DQB1*02:14:01 552 bp +bin1 HLA:HLA08916 100.000 270 HLA:HLA08916 DQB1*02:08 552 bp +bin1 HLA:HLA08083 100.000 270 HLA:HLA08083 DQB1*02:07:01 552 bp +bin1 HLA:HLA15028 100.000 270 HLA:HLA15028 DQB1*02:01:24 552 bp +bin1 HLA:HLA08915 100.000 270 HLA:HLA08915 DQB1*02:01:07 552 bp +bin1 HLA:HLA08777 100.000 270 HLA:HLA08777 DQB1*02:01:06 552 bp +bin1 HLA:HLA08310 100.000 270 HLA:HLA08310 DQB1*02:01:05 552 bp +bin1 HLA:HLA05923 100.000 270 HLA:HLA05923 DQB1*02:01:04 552 bp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sideA2.blast.tsv Wed May 30 07:50:21 2018 -0400 @@ -0,0 +1,64 @@ +QuerySeq SubjectSeq Identity Length Info +bin2 HLA:HLA10211 100.000 271 HLA:HLA10211 DQB1*03:93 786 bp +bin2 HLA:HLA10210 100.000 271 HLA:HLA10210 DQB1*03:92 786 bp +bin2 HLA:HLA18123 100.000 271 HLA:HLA18123 DQB1*03:276N 677 bp +bin2 HLA:HLA16190 100.000 271 HLA:HLA16190 DQB1*03:254 786 bp +bin2 HLA:HLA16160 100.000 271 HLA:HLA16160 DQB1*03:252 786 bp +bin2 HLA:HLA15585 100.000 271 HLA:HLA15585 DQB1*03:243 786 bp +bin2 HLA:HLA15476 100.000 271 HLA:HLA15476 DQB1*03:231 663 bp +bin2 HLA:HLA13930 100.000 271 HLA:HLA13930 DQB1*03:197Q 780 bp +bin2 HLA:HLA02208 100.000 271 HLA:HLA02208 DQB1*03:16 677 bp +bin2 HLA:HLA01574 100.000 271 HLA:HLA01574 DQB1*03:12 786 bp +bin2 HLA:HLA17695 100.000 271 HLA:HLA17695 DQB1*03:10:02:02 786 bp +bin2 HLA:HLA10276 100.000 271 HLA:HLA10276 DQB1*03:10:02:01 786 bp +bin2 HLA:HLA01164 100.000 271 HLA:HLA01164 DQB1*03:10:01 690 bp +bin2 HLA:HLA00630 100.000 271 HLA:HLA00630 DQB1*03:04:01 786 bp +bin2 HLA:HLA16168 100.000 271 HLA:HLA16168 DQB1*03:01:37 786 bp +bin2 HLA:HLA10540 100.000 271 HLA:HLA10540 DQB1*03:01:22 786 bp +bin2 HLA:HLA10209 100.000 271 HLA:HLA10209 DQB1*03:01:21 786 bp +bin2 HLA:HLA09634 100.000 271 HLA:HLA09634 DQB1*03:01:17 786 bp +bin2 HLA:HLA09096 100.000 271 HLA:HLA09096 DQB1*03:01:08 786 bp +bin2 HLA:HLA02688 100.000 271 HLA:HLA02688 DQB1*03:01:03 786 bp +bin2 HLA:HLA18096 100.000 271 HLA:HLA18096 DQB1*03:01:01:20 786 bp +bin2 HLA:HLA17597 100.000 271 HLA:HLA17597 DQB1*03:01:01:19 786 bp +bin2 HLA:HLA17466 100.000 271 HLA:HLA17466 DQB1*03:01:01:18 786 bp +bin2 HLA:HLA17462 100.000 271 HLA:HLA17462 DQB1*03:01:01:17 786 bp +bin2 HLA:HLA17461 100.000 271 HLA:HLA17461 DQB1*03:01:01:16 786 bp +bin2 HLA:HLA17460 100.000 271 HLA:HLA17460 DQB1*03:01:01:15 786 bp +bin2 HLA:HLA17369 100.000 271 HLA:HLA17369 DQB1*03:01:01:14 786 bp +bin2 HLA:HLA17367 100.000 271 HLA:HLA17367 DQB1*03:01:01:12 786 bp 
+bin2 HLA:HLA17366 100.000 271 HLA:HLA17366 DQB1*03:01:01:11 786 bp +bin2 HLA:HLA17365 100.000 271 HLA:HLA17365 DQB1*03:01:01:10 786 bp +bin2 HLA:HLA17335 100.000 271 HLA:HLA17335 DQB1*03:01:01:09 786 bp +bin2 HLA:HLA17197 100.000 271 HLA:HLA17197 DQB1*03:01:01:08 786 bp +bin2 HLA:HLA17167 100.000 271 HLA:HLA17167 DQB1*03:01:01:07 786 bp +bin2 HLA:HLA17162 100.000 271 HLA:HLA17162 DQB1*03:01:01:06 786 bp +bin2 HLA:HLA15507 100.000 271 HLA:HLA15507 DQB1*03:01:01:05 786 bp +bin2 HLA:HLA15506 100.000 271 HLA:HLA15506 DQB1*03:01:01:04 786 bp +bin2 HLA:HLA06616 100.000 271 HLA:HLA06616 DQB1*03:01:01:03 786 bp +bin2 HLA:HLA06613 100.000 271 HLA:HLA06613 DQB1*03:01:01:02 786 bp +bin2 HLA:HLA00625 100.000 271 HLA:HLA00625 DQB1*03:01:01:01 786 bp +bin2 HLA:HLA09837 100.000 270 HLA:HLA09837 DQB1*03:83 552 bp +bin2 HLA:HLA09654 100.000 270 HLA:HLA09654 DQB1*03:80 552 bp +bin2 HLA:HLA09566 100.000 270 HLA:HLA09566 DQB1*03:73 552 bp +bin2 HLA:HLA09095 100.000 270 HLA:HLA09095 DQB1*03:47 552 bp +bin2 HLA:HLA09093 100.000 270 HLA:HLA09093 DQB1*03:46 552 bp +bin2 HLA:HLA08289 100.000 270 HLA:HLA08289 DQB1*03:44 552 bp +bin2 HLA:HLA06177 100.000 270 HLA:HLA06177 DQB1*03:36 552 bp +bin2 HLA:HLA05767 100.000 270 HLA:HLA05767 DQB1*03:28 552 bp +bin2 HLA:HLA05374 100.000 270 HLA:HLA05374 DQB1*03:27 552 bp +bin2 HLA:HLA15599 100.000 270 HLA:HLA15599 DQB1*03:242 552 bp +bin2 HLA:HLA13500 100.000 270 HLA:HLA13500 DQB1*03:186 552 bp +bin2 HLA:HLA11586 100.000 270 HLA:HLA11586 DQB1*03:139 552 bp +bin2 HLA:HLA01589 100.000 270 HLA:HLA01589 DQB1*03:13 552 bp +bin2 HLA:HLA11128 100.000 270 HLA:HLA11128 DQB1*03:119 552 bp +bin2 HLA:HLA10928 100.000 270 HLA:HLA10928 DQB1*03:114 552 bp +bin2 HLA:HLA10663 100.000 270 HLA:HLA10663 DQB1*03:113 552 bp +bin2 HLA:HLA17148 100.000 270 HLA:HLA17148 DQB1*03:01:38 552 bp +bin2 HLA:HLA15778 100.000 270 HLA:HLA15778 DQB1*03:01:34 552 bp +bin2 HLA:HLA14778 100.000 270 HLA:HLA14778 DQB1*03:01:33 552 bp +bin2 HLA:HLA14637 100.000 270 HLA:HLA14637 DQB1*03:01:32 
552 bp +bin2 HLA:HLA13622 100.000 270 HLA:HLA13622 DQB1*03:01:30 552 bp +bin2 HLA:HLA09772 100.000 270 HLA:HLA09772 DQB1*03:01:19 552 bp +bin2 HLA:HLA09094 100.000 270 HLA:HLA09094 DQB1*03:01:07 552 bp +bin2 HLA:HLA06142 100.000 270 HLA:HLA06142 DQB1*03:01:06 552 bp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sideB1.blast.tsv Wed May 30 07:50:21 2018 -0400 @@ -0,0 +1,14 @@ +QuerySeq SubjectSeq Identity Length Info +bin1 HLA:HLA09875 100.000 249 HLA:HLA09875 DQA1*01:12 531 bp +bin1 HLA:HLA11066 100.000 249 HLA:HLA11066 DQA1*01:05:02 768 bp +bin1 HLA:HLA00606 100.000 249 HLA:HLA00606 DQA1*01:05:01 768 bp +bin1 HLA:HLA01376 100.000 249 HLA:HLA01376 DQA1*01:04:02 768 bp +bin1 HLA:HLA14793 100.000 249 HLA:HLA14793 DQA1*01:04:01:04 768 bp +bin1 HLA:HLA14792 100.000 249 HLA:HLA14792 DQA1*01:04:01:03 768 bp +bin1 HLA:HLA06597 100.000 249 HLA:HLA06597 DQA1*01:04:01:02 768 bp +bin1 HLA:HLA00605 100.000 249 HLA:HLA00605 DQA1*01:04:01:01 768 bp +bin1 HLA:HLA01409 100.000 249 HLA:HLA01409 DQA1*01:01:02 768 bp +bin1 HLA:HLA17305 100.000 249 HLA:HLA17305 DQA1*01:01:01:05 768 bp +bin1 HLA:HLA14787 100.000 249 HLA:HLA14787 DQA1*01:01:01:03 768 bp +bin1 HLA:HLA14786 100.000 249 HLA:HLA14786 DQA1*01:01:01:02 768 bp +bin1 HLA:HLA00601 100.000 249 HLA:HLA00601 DQA1*01:01:01:01 768 bp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sideB2.blast.tsv Wed May 30 07:50:21 2018 -0400 @@ -0,0 +1,9 @@ +QuerySeq SubjectSeq Identity Length Info +bin2 HLA:HLA17290 100.000 249 HLA:HLA17290 DQA1*03:04 768 bp +bin2 HLA:HLA07419 100.000 249 HLA:HLA07419 DQA1*03:03:02 768 bp +bin2 HLA:HLA14797 100.000 249 HLA:HLA14797 DQA1*03:03:01:03 768 bp +bin2 HLA:HLA14795 100.000 249 HLA:HLA14795 DQA1*03:03:01:02 768 bp +bin2 HLA:HLA00611 100.000 249 HLA:HLA00611 DQA1*03:03:01:01 768 bp +bin2 HLA:HLA17309 100.000 249 HLA:HLA17309 DQA1*03:02:01:02 768 bp +bin2 HLA:HLA00610 100.000 249 HLA:HLA00610 DQA1*03:02:01:01 768 bp +bin2 HLA:HLA00608 100.000 249 HLA:HLA00608 DQA1*03:01:01 768 bp