comparison model_prediction.py @ 24:a5aed87b2cc0 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 5b2ac730ec6d3b762faa9034eddd19ad1b347476"
author bgruening
date Mon, 16 Dec 2019 05:28:32 -0500
parents 5895fe0b8bde
children 9b017b0da56e
comparison
equal deleted inserted replaced
23:56f6ebf69ddc 24:a5aed87b2cc0
1 import argparse 1 import argparse
2 import json 2 import json
3 import numpy as np 3 import numpy as np
4 import pandas as pd 4 import pandas as pd
5 import tabix
6 import warnings 5 import warnings
7 6
8 from scipy.io import mmread 7 from scipy.io import mmread
9 from sklearn.pipeline import Pipeline 8 from sklearn.pipeline import Pipeline
10 9
11 from galaxy_ml.externals.selene_sdk.sequences import Genome
12 from galaxy_ml.utils import (load_model, read_columns, 10 from galaxy_ml.utils import (load_model, read_columns,
13 get_module, try_get_attr) 11 get_module, try_get_attr)
14 12
15 13
16 N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1)) 14 N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1))
136 options['blacklist_regions'] = None 134 options['blacklist_regions'] = None
137 135
138 pred_data_generator = klass( 136 pred_data_generator = klass(
139 ref_genome_path=ref_seq, vcf_path=vcf_path, **options) 137 ref_genome_path=ref_seq, vcf_path=vcf_path, **options)
140 138
141 pred_data_generator.fit() 139 pred_data_generator.set_processing_attrs()
142 140
143 variants = pred_data_generator.variants 141 variants = pred_data_generator.variants
144 # TODO : remove the following block after galaxy-ml v0.7.13 142
145 blacklist_tabix = getattr(pred_data_generator.reference_genome_,
146 '_blacklist_tabix', None)
147 clean_variants = []
148 if blacklist_tabix:
149 start_radius = pred_data_generator.start_radius_
150 end_radius = pred_data_generator.end_radius_
151
152 for chrom, pos, name, ref, alt, strand in variants:
153 center = pos + len(ref) // 2
154 start = center - start_radius
155 end = center + end_radius
156
157 if isinstance(pred_data_generator.reference_genome_, Genome):
158 if "chr" not in chrom:
159 chrom = "chr" + chrom
160 if "MT" in chrom:
161 chrom = chrom[:-1]
162 try:
163 rows = blacklist_tabix.query(chrom, start, end)
164 found = 0
165 for row in rows:
166 found = 1
167 break
168 if found:
169 continue
170 except tabix.TabixError:
171 pass
172
173 clean_variants.append((chrom, pos, name, ref, alt, strand))
174 else:
175 clean_variants = variants
176
177 setattr(pred_data_generator, 'variants', clean_variants)
178
179 variants = np.array(clean_variants)
180 # predict 1600 sample at once then write to file 143 # predict 1600 sample at once then write to file
181 gen_flow = pred_data_generator.flow(batch_size=1600) 144 gen_flow = pred_data_generator.flow(batch_size=1600)
182 145
183 file_writer = open(outfile_predict, 'w') 146 file_writer = open(outfile_predict, 'w')
184 header_row = '\t'.join(['chrom', 'pos', 'name', 'ref', 147 header_row = '\t'.join(['chrom', 'pos', 'name', 'ref',