changeset 0:10a407fb5072 draft

planemo upload for repository https://github.com/ErasmusMC-Bioinformatics/galaxytools-emc/tree/master/tools/hla_dq commit d6273a8247a1cbb7df2b26b9e97cd1bd3faa4f61
author erasmus-medical-center
date Wed, 30 May 2018 07:50:21 -0400
parents
children 4fc47a3ff9e8
files hla_dq.py hla_dq.xml test-data/all_sideA3.blast.tsv test-data/all_sideB3.blast.tsv test-data/sideA1.blast.tsv test-data/sideA2.blast.tsv test-data/sideB1.blast.tsv test-data/sideB2.blast.tsv test-data/test1_results.tsv test-data/test2_results.tsv
diffstat 10 files changed, 301 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hla_dq.py	Wed May 30 07:50:21 2018 -0400
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+'''
+Given HLA-DQA1 and HLA-DQB1 genes annotated with BLAST IMGT/HLA database
+genotypes, determine associated serotypes
+
+
+DQA1 | DQB1 | type
+--------------------
+02:01| 02:02| DQ2.2
+03:03| 02:02| DQ2.3
+05:01| 02:01| DQ2.5
+03:01| 03:02| DQ8
+03:02| 03:02| DQ8
+03:03| 03:02| DQ8
+
+Annotations are of form DQ[A|B]1*xx:yy[:zz[:vv]] where xx:yy is of interest.
+
+Example: "HLA:HLA11066 DQA1*01:05:02 768 bp"
+'''
+
+import argparse
+import itertools
+
+
+def to_matrix(l, n):
+    return [l[i:i+n] for i in range(0, len(l), n)]
+
+
+def get_list_of_associated_types(filename, column):
+    with open(filename) as f:
+        contents = f.readlines()
+
+    return [':'.join(line.split('\t')[column-1].split(' ')[1]
+            .split('*')[1].split(':')[0:2])
+            for line in contents[1:]]
+
+
+def get_associations(typesA, typesB):
+    ''' Given list of genotype annotations (e.g. DQA1*02:01..)
+        from A and B, determine possible associated serotypes
+    '''
+
+    ''' each combination of DQA1,DQB1,type '''
+    associated_combinations = [
+        ['02:01', '02:02', 'DQ2.2'],
+        ['03:03', '02:02', 'DQ2.3'],
+        ['05:01', '02:01', 'DQ2.5'],
+        ['03:01', '03:02', 'DQ8'],
+        ['03:02', '03:02', 'DQ8'],
+        ['03:03', '03:02', 'DQ8']]
+
+    return [a[2] for a in associated_combinations
+            if a[0] in typesA and a[1] in typesB]
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-A', required=True, action='append',
+                        help='BLAST hits A gene')
+    parser.add_argument('-B', required=True, action='append',
+                        help='BLAST hits B gene')
+    parser.add_argument('-c', '--column', default=5, type=int,
+                        help='Column number containing the BLAST annotation')
+    args = parser.parse_args()
+
+    # TODO: QC check that file A contains DQA1 annotations and
+    # B file contains DQB1 annotations?
+
+    # find possible associated types, for all combinations of alleles
+    typesA = [get_list_of_associated_types(A, args.column) for A in args.A]
+    typesB = [get_list_of_associated_types(B, args.column) for B in args.B]
+    associations = [get_associations(c[0], c[1])
+                    for c in itertools.product(typesA, typesB)]
+    associations = to_matrix(associations, len(args.B))
+
+    # write output table
+    header = '\t'+'\t'.join(['B'+str(i+1) for i in range(0, len(args.B))])+'\n'
+    bcount = 0
+    with open('results.tsv', 'w') as outfile:
+        outfile.write(header)
+        for line in associations:
+            bcount += 1
+            outfile.write(
+                'A' + str(bcount) + '\t' +
+                '\t'.join([';'.join(sorted(set(l)))
+                          if l else '-' for l in line])+'\n')
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hla_dq.xml	Wed May 30 07:50:21 2018 -0400
@@ -0,0 +1,94 @@
+<tool id="hla_dq" name="HLA-DQ typing" version="1.0">
+    <description> Determine possible associated types given BLAST IMGT/HLA annotation</description>
+    <command detect_errors="exit_code"><![CDATA[
+python '$__tool_directory__/hla_dq.py'
+#for $i in $filesA:
+    -A '$i.A'
+#end for
+#for $i in $filesB:
+    -B '$i.B'
+#end for
+#if $column and $column is not None:
+    --column $column
+#end if
+    ]]></command>
+    <inputs>
+        <!-- use repeats cuz order matters -->
+        <repeat name="filesA" title="BLAST results A gene sequences" min="1">
+            <param argument="-A" label="BLAST IMGT/HLA hits A gene" type="data" format="tabular"/>
+        </repeat>
+        <repeat name="filesB" title="BLAST results B gene sequences" min="1">
+            <param argument="-B" label="BLAST IMGT/HLA hits B gene" type="data" format="tabular"/>
+        </repeat>
+        <param argument="--column" label="Column number containing the BLAST IMGT/HLA genotype annotation" type="integer" value="5" min="1" help="Example annotation: HLA:HLA11066 DQA1*01:05:02 768 bp"/>
+    </inputs>
+    <outputs>
+        <data name="serotype_table" format="tabular" from_work_dir="results.tsv" label="${tool.name} on ${on_string}: Serotype table"/>
+    </outputs>
+    <tests>
+        <test><!-- test with real data -->
+            <repeat name="filesA">
+                <param name="A" value="sideA1.blast.tsv"/>
+            </repeat>
+            <repeat name="filesA">
+                <param name="A" value="sideA2.blast.tsv"/>
+            </repeat>
+            <repeat name="filesB">
+                <param name="B" value="sideB1.blast.tsv"/>
+            </repeat>
+            <repeat name="filesB">
+                <param name="B" value="sideB2.blast.tsv"/>
+            </repeat>
+            <output name="serotype_table" file="test1_results.tsv"/>
+        </test>
+        <test><!-- test three files each and data leading to serotypes -->
+            <repeat name="filesA">
+                <param name="A" value="sideA1.blast.tsv"/>
+            </repeat>
+            <repeat name="filesA">
+                <param name="A" value="sideA2.blast.tsv"/>
+            </repeat>
+            <repeat name="filesA">
+                <param name="A" value="all_sideA3.blast.tsv"/>
+            </repeat>
+            <repeat name="filesB">
+                <param name="B" value="sideB1.blast.tsv"/>
+            </repeat>
+            <repeat name="filesB">
+                <param name="B" value="sideB2.blast.tsv"/>
+            </repeat>
+             <repeat name="filesB">
+                <param name="B" value="all_sideB3.blast.tsv"/>
+            </repeat>
+            <output name="serotype_table" file="test2_results.tsv"/>
+        </test>
+    </tests>
+
+    <help><![CDATA[
+
+Given files annotated with BLAST against the IMGT/HLA database for the HLA-DQA1 (A) and HLA-DQB1 (B) genes (one or more files each), determine possible associated HLA-DQ serotypes.
+
+===== ===== =====
+DQA1  DQB1  type
+===== ===== =====
+02:01 02:02 DQ2.2
+03:03 02:02 DQ2.3
+05:01 02:01 DQ2.5
+03:01 03:02 DQ8
+03:02 03:02 DQ8
+03:03 03:02 DQ8
+===== ===== =====
+
+Example result table:
+
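+==== ==== =========
+type B1   B2
+==== ==== =========
+A1   DQ8  \-
+A2   \-   DQ2.2;DQ8
+==== ==== =========
+
+Where A1, A2 correspond to the annotated input files given for gene A and B1, B2 to those given for gene B, each file representing a possible allele sequence. A dash (-) means that no known serotype matches that combination of files.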
+
+    ]]></help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/all_sideA3.blast.tsv	Wed May 30 07:50:21 2018 -0400
@@ -0,0 +1,7 @@
+QuerySeq	SubjectSeq	Identity	Length	Info
+bin1	HLA:HLA10211	100.000	271	HLA:HLA10211 DQA1*02:01 786 bp
+bin1	HLA:HLA10210	100.000	271	HLA:HLA10210 DQA1*03:03 786 bp
+bin1	HLA:HLA18123	100.000	271	HLA:HLA18123 DQA1*05:01 677 bp
+bin1	HLA:HLA16190	100.000	271	HLA:HLA16190 DQA1*03:01 786 bp
+bin1	HLA:HLA16160	100.000	271	HLA:HLA16160 DQA1*03:02 786 bp
+bin1	HLA:HLA15585	100.000	271	HLA:HLA15585 DQA1*03:03 786 bp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/all_sideB3.blast.tsv	Wed May 30 07:50:21 2018 -0400
@@ -0,0 +1,4 @@
+QuerySeq	SubjectSeq	Identity	Length	Info
+bin1	HLA:HLA10211	100.000	271	HLA:HLA10211 DQB1*02:02 786 bp
+bin1	HLA:HLA10210	100.000	271	HLA:HLA10210 DQB1*02:01 786 bp
+bin1	HLA:HLA18123	100.000	271	HLA:HLA18123 DQB1*03:02 677 bp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sideA1.blast.tsv	Wed May 30 07:50:21 2018 -0400
@@ -0,0 +1,16 @@
+QuerySeq	SubjectSeq	Identity	Length	Info
+bin1	HLA:HLA16140	100.000	271	HLA:HLA16140 DQB1*02:83 786 bp
+bin1	HLA:HLA13925	100.000	271	HLA:HLA13925 DQB1*02:63 677 bp
+bin1	HLA:HLA12878	100.000	271	HLA:HLA12878 DQB1*02:53Q 783 bp
+bin1	HLA:HLA00622	100.000	271	HLA:HLA00622 DQB1*02:01:01 786 bp
+bin1	HLA:HLA14955	100.000	270	HLA:HLA14955 DQB1*02:72 552 bp
+bin1	HLA:HLA13230	100.000	270	HLA:HLA13230 DQB1*02:57 552 bp
+bin1	HLA:HLA09344	100.000	270	HLA:HLA09344 DQB1*02:27 552 bp
+bin1	HLA:HLA09245	100.000	270	HLA:HLA09245 DQB1*02:14:01 552 bp
+bin1	HLA:HLA08916	100.000	270	HLA:HLA08916 DQB1*02:08 552 bp
+bin1	HLA:HLA08083	100.000	270	HLA:HLA08083 DQB1*02:07:01 552 bp
+bin1	HLA:HLA15028	100.000	270	HLA:HLA15028 DQB1*02:01:24 552 bp
+bin1	HLA:HLA08915	100.000	270	HLA:HLA08915 DQB1*02:01:07 552 bp
+bin1	HLA:HLA08777	100.000	270	HLA:HLA08777 DQB1*02:01:06 552 bp
+bin1	HLA:HLA08310	100.000	270	HLA:HLA08310 DQB1*02:01:05 552 bp
+bin1	HLA:HLA05923	100.000	270	HLA:HLA05923 DQB1*02:01:04 552 bp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sideA2.blast.tsv	Wed May 30 07:50:21 2018 -0400
@@ -0,0 +1,64 @@
+QuerySeq	SubjectSeq	Identity	Length	Info
+bin2	HLA:HLA10211	100.000	271	HLA:HLA10211 DQB1*03:93 786 bp
+bin2	HLA:HLA10210	100.000	271	HLA:HLA10210 DQB1*03:92 786 bp
+bin2	HLA:HLA18123	100.000	271	HLA:HLA18123 DQB1*03:276N 677 bp
+bin2	HLA:HLA16190	100.000	271	HLA:HLA16190 DQB1*03:254 786 bp
+bin2	HLA:HLA16160	100.000	271	HLA:HLA16160 DQB1*03:252 786 bp
+bin2	HLA:HLA15585	100.000	271	HLA:HLA15585 DQB1*03:243 786 bp
+bin2	HLA:HLA15476	100.000	271	HLA:HLA15476 DQB1*03:231 663 bp
+bin2	HLA:HLA13930	100.000	271	HLA:HLA13930 DQB1*03:197Q 780 bp
+bin2	HLA:HLA02208	100.000	271	HLA:HLA02208 DQB1*03:16 677 bp
+bin2	HLA:HLA01574	100.000	271	HLA:HLA01574 DQB1*03:12 786 bp
+bin2	HLA:HLA17695	100.000	271	HLA:HLA17695 DQB1*03:10:02:02 786 bp
+bin2	HLA:HLA10276	100.000	271	HLA:HLA10276 DQB1*03:10:02:01 786 bp
+bin2	HLA:HLA01164	100.000	271	HLA:HLA01164 DQB1*03:10:01 690 bp
+bin2	HLA:HLA00630	100.000	271	HLA:HLA00630 DQB1*03:04:01 786 bp
+bin2	HLA:HLA16168	100.000	271	HLA:HLA16168 DQB1*03:01:37 786 bp
+bin2	HLA:HLA10540	100.000	271	HLA:HLA10540 DQB1*03:01:22 786 bp
+bin2	HLA:HLA10209	100.000	271	HLA:HLA10209 DQB1*03:01:21 786 bp
+bin2	HLA:HLA09634	100.000	271	HLA:HLA09634 DQB1*03:01:17 786 bp
+bin2	HLA:HLA09096	100.000	271	HLA:HLA09096 DQB1*03:01:08 786 bp
+bin2	HLA:HLA02688	100.000	271	HLA:HLA02688 DQB1*03:01:03 786 bp
+bin2	HLA:HLA18096	100.000	271	HLA:HLA18096 DQB1*03:01:01:20 786 bp
+bin2	HLA:HLA17597	100.000	271	HLA:HLA17597 DQB1*03:01:01:19 786 bp
+bin2	HLA:HLA17466	100.000	271	HLA:HLA17466 DQB1*03:01:01:18 786 bp
+bin2	HLA:HLA17462	100.000	271	HLA:HLA17462 DQB1*03:01:01:17 786 bp
+bin2	HLA:HLA17461	100.000	271	HLA:HLA17461 DQB1*03:01:01:16 786 bp
+bin2	HLA:HLA17460	100.000	271	HLA:HLA17460 DQB1*03:01:01:15 786 bp
+bin2	HLA:HLA17369	100.000	271	HLA:HLA17369 DQB1*03:01:01:14 786 bp
+bin2	HLA:HLA17367	100.000	271	HLA:HLA17367 DQB1*03:01:01:12 786 bp
+bin2	HLA:HLA17366	100.000	271	HLA:HLA17366 DQB1*03:01:01:11 786 bp
+bin2	HLA:HLA17365	100.000	271	HLA:HLA17365 DQB1*03:01:01:10 786 bp
+bin2	HLA:HLA17335	100.000	271	HLA:HLA17335 DQB1*03:01:01:09 786 bp
+bin2	HLA:HLA17197	100.000	271	HLA:HLA17197 DQB1*03:01:01:08 786 bp
+bin2	HLA:HLA17167	100.000	271	HLA:HLA17167 DQB1*03:01:01:07 786 bp
+bin2	HLA:HLA17162	100.000	271	HLA:HLA17162 DQB1*03:01:01:06 786 bp
+bin2	HLA:HLA15507	100.000	271	HLA:HLA15507 DQB1*03:01:01:05 786 bp
+bin2	HLA:HLA15506	100.000	271	HLA:HLA15506 DQB1*03:01:01:04 786 bp
+bin2	HLA:HLA06616	100.000	271	HLA:HLA06616 DQB1*03:01:01:03 786 bp
+bin2	HLA:HLA06613	100.000	271	HLA:HLA06613 DQB1*03:01:01:02 786 bp
+bin2	HLA:HLA00625	100.000	271	HLA:HLA00625 DQB1*03:01:01:01 786 bp
+bin2	HLA:HLA09837	100.000	270	HLA:HLA09837 DQB1*03:83 552 bp
+bin2	HLA:HLA09654	100.000	270	HLA:HLA09654 DQB1*03:80 552 bp
+bin2	HLA:HLA09566	100.000	270	HLA:HLA09566 DQB1*03:73 552 bp
+bin2	HLA:HLA09095	100.000	270	HLA:HLA09095 DQB1*03:47 552 bp
+bin2	HLA:HLA09093	100.000	270	HLA:HLA09093 DQB1*03:46 552 bp
+bin2	HLA:HLA08289	100.000	270	HLA:HLA08289 DQB1*03:44 552 bp
+bin2	HLA:HLA06177	100.000	270	HLA:HLA06177 DQB1*03:36 552 bp
+bin2	HLA:HLA05767	100.000	270	HLA:HLA05767 DQB1*03:28 552 bp
+bin2	HLA:HLA05374	100.000	270	HLA:HLA05374 DQB1*03:27 552 bp
+bin2	HLA:HLA15599	100.000	270	HLA:HLA15599 DQB1*03:242 552 bp
+bin2	HLA:HLA13500	100.000	270	HLA:HLA13500 DQB1*03:186 552 bp
+bin2	HLA:HLA11586	100.000	270	HLA:HLA11586 DQB1*03:139 552 bp
+bin2	HLA:HLA01589	100.000	270	HLA:HLA01589 DQB1*03:13 552 bp
+bin2	HLA:HLA11128	100.000	270	HLA:HLA11128 DQB1*03:119 552 bp
+bin2	HLA:HLA10928	100.000	270	HLA:HLA10928 DQB1*03:114 552 bp
+bin2	HLA:HLA10663	100.000	270	HLA:HLA10663 DQB1*03:113 552 bp
+bin2	HLA:HLA17148	100.000	270	HLA:HLA17148 DQB1*03:01:38 552 bp
+bin2	HLA:HLA15778	100.000	270	HLA:HLA15778 DQB1*03:01:34 552 bp
+bin2	HLA:HLA14778	100.000	270	HLA:HLA14778 DQB1*03:01:33 552 bp
+bin2	HLA:HLA14637	100.000	270	HLA:HLA14637 DQB1*03:01:32 552 bp
+bin2	HLA:HLA13622	100.000	270	HLA:HLA13622 DQB1*03:01:30 552 bp
+bin2	HLA:HLA09772	100.000	270	HLA:HLA09772 DQB1*03:01:19 552 bp
+bin2	HLA:HLA09094	100.000	270	HLA:HLA09094 DQB1*03:01:07 552 bp
+bin2	HLA:HLA06142	100.000	270	HLA:HLA06142 DQB1*03:01:06 552 bp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sideB1.blast.tsv	Wed May 30 07:50:21 2018 -0400
@@ -0,0 +1,14 @@
+QuerySeq	SubjectSeq	Identity	Length	Info
+bin1	HLA:HLA09875	100.000	249	HLA:HLA09875 DQA1*01:12 531 bp
+bin1	HLA:HLA11066	100.000	249	HLA:HLA11066 DQA1*01:05:02 768 bp
+bin1	HLA:HLA00606	100.000	249	HLA:HLA00606 DQA1*01:05:01 768 bp
+bin1	HLA:HLA01376	100.000	249	HLA:HLA01376 DQA1*01:04:02 768 bp
+bin1	HLA:HLA14793	100.000	249	HLA:HLA14793 DQA1*01:04:01:04 768 bp
+bin1	HLA:HLA14792	100.000	249	HLA:HLA14792 DQA1*01:04:01:03 768 bp
+bin1	HLA:HLA06597	100.000	249	HLA:HLA06597 DQA1*01:04:01:02 768 bp
+bin1	HLA:HLA00605	100.000	249	HLA:HLA00605 DQA1*01:04:01:01 768 bp
+bin1	HLA:HLA01409	100.000	249	HLA:HLA01409 DQA1*01:01:02 768 bp
+bin1	HLA:HLA17305	100.000	249	HLA:HLA17305 DQA1*01:01:01:05 768 bp
+bin1	HLA:HLA14787	100.000	249	HLA:HLA14787 DQA1*01:01:01:03 768 bp
+bin1	HLA:HLA14786	100.000	249	HLA:HLA14786 DQA1*01:01:01:02 768 bp
+bin1	HLA:HLA00601	100.000	249	HLA:HLA00601 DQA1*01:01:01:01 768 bp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sideB2.blast.tsv	Wed May 30 07:50:21 2018 -0400
@@ -0,0 +1,9 @@
+QuerySeq	SubjectSeq	Identity	Length	Info
+bin2	HLA:HLA17290	100.000	249	HLA:HLA17290 DQA1*03:04 768 bp
+bin2	HLA:HLA07419	100.000	249	HLA:HLA07419 DQA1*03:03:02 768 bp
+bin2	HLA:HLA14797	100.000	249	HLA:HLA14797 DQA1*03:03:01:03 768 bp
+bin2	HLA:HLA14795	100.000	249	HLA:HLA14795 DQA1*03:03:01:02 768 bp
+bin2	HLA:HLA00611	100.000	249	HLA:HLA00611 DQA1*03:03:01:01 768 bp
+bin2	HLA:HLA17309	100.000	249	HLA:HLA17309 DQA1*03:02:01:02 768 bp
+bin2	HLA:HLA00610	100.000	249	HLA:HLA00610 DQA1*03:02:01:01 768 bp
+bin2	HLA:HLA00608	100.000	249	HLA:HLA00608 DQA1*03:01:01 768 bp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test1_results.tsv	Wed May 30 07:50:21 2018 -0400
@@ -0,0 +1,3 @@
+	B1	B2
+A1	-	-
+A2	-	DQ8
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test2_results.tsv	Wed May 30 07:50:21 2018 -0400
@@ -0,0 +1,4 @@
+	B1	B2	B3
+A1	-	-	DQ2.2
+A2	-	DQ8	DQ8
+A3	-	DQ8	DQ2.2;DQ2.3;DQ2.5;DQ8