Mercurial > repos > galaxyp > percolator
diff nested_collection.py @ 1:86770eea5b09 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
author | galaxyp |
---|---|
date | Sat, 04 Mar 2017 20:36:03 -0500 |
parents | |
children | 7a0951d0e13e |
line wrap: on
line diff
# b/nested_collection.py (new file in this changeset; previously /dev/null)
import argparse
import os
import re
from collections import OrderedDict


def get_filename_index_with_identifier(realnames, pool_id):
    """Return the indices of the entries in *realnames* that match *pool_id*.

    *pool_id* is interpreted as a regular expression and is searched for
    anywhere inside each name (``re.search`` semantics), so regex
    metacharacters in a pool identifier are significant.

    Returns a list of integer positions, in input order; the list is empty
    when no name matches.
    """
    # Compile once per pool instead of once per file name.
    pattern = re.compile(pool_id)
    return [index for index, fn in enumerate(realnames)
            if pattern.search(fn) is not None]


def get_batches_of_galaxyfiles(realnames, batchsize, pool_ids):
    """Yield ``(pool_id, batch)`` pairs that split the input files in batches.

    realnames -- sequence of file names used for pool matching.
    batchsize -- maximum number of files per batch. Any falsy value
                 (``False``, ``None``, ``0``, ``''``) disables splitting, as
                 does a value that converts to an integer <= 0. Strings are
                 converted with ``int()``; a non-numeric string raises
                 ``ValueError`` when the generator is first advanced.
    pool_ids  -- sequence of regular expressions, one per pool, or a falsy
                 value to put every file in a single pool named ``'pool0'``.

    Each ``batch`` is a list of indices into *realnames*. Batches never span
    two pools: a partially filled batch is emitted when its pool ends. A file
    whose name matches several pool identifiers is emitted once per matching
    pool, and a pool identifier given more than once is processed only once.
    """
    # Parse the batch size a single time rather than for every file.
    size = int(batchsize) if batchsize else 0
    if pool_ids:
        filegroups = OrderedDict(
            (p_id, get_filename_index_with_identifier(realnames, p_id))
            for p_id in pool_ids)
    else:
        # No pooling requested: one implicit pool containing every file.
        filegroups = OrderedDict([('pool0', range(len(realnames)))])
    for pool_id, grouped_indices in filegroups.items():
        batch = []
        for index in grouped_indices:
            batch.append(index)
            if size > 0 and len(batch) == size:
                yield pool_id, batch
                batch = []
        # Flush the remainder so the next pool starts with an empty batch.
        if batch:
            yield pool_id, batch


def main():
    """Create one symlink per input file, named after its pool and batch.

    Command line options:
      --batchsize     maximum number of files per batch (optional)
      --real-names    names used for matching against the pool identifiers
      --galaxy-files  paths to link to, in the same order as --real-names
      --pool-ids      regular expressions defining the pools (optional)

    Links are created in the current working directory and are named
    ``<pool>___batch<N>_inputfn<M>.mzid``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchsize', dest='batchsize', default=False)
    # Both file lists are mandatory; without them there is nothing to link.
    parser.add_argument('--real-names', dest='realnames', nargs='+',
                        required=True)
    parser.add_argument('--galaxy-files', dest='galaxyfiles', nargs='+',
                        required=True)
    parser.add_argument('--pool-ids', dest='poolids', nargs='+', default=False)
    args = parser.parse_args()
    # Indices computed from the real names are used to address the galaxy
    # files, so the two lists have to line up one-to-one. Checking up front
    # avoids failing half-way with some symlinks already created.
    if len(args.realnames) != len(args.galaxyfiles):
        parser.error(
            '--real-names and --galaxy-files must list the same number of '
            'files (got {} and {})'.format(len(args.realnames),
                                           len(args.galaxyfiles)))
    batches = get_batches_of_galaxyfiles(
        args.realnames, args.batchsize, args.poolids)
    for batchcount, (pool_id, batch) in enumerate(batches):
        for fncount, index in enumerate(batch):
            batchfile = args.galaxyfiles[index]
            dsetname = '{}___batch{}_inputfn{}.mzid'.format(
                pool_id, batchcount, fncount)
            print('producing {}'.format(dsetname))
            os.symlink(batchfile, dsetname)


if __name__ == '__main__':
    main()