Mercurial > repos > galaxyp > percolator
annotate nested_collection.py @ 5:dce55ca21b98 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 4fd46e83045d78a4703c0ae7d0cfc396bdbc8e78
author | galaxyp |
---|---|
date | Fri, 19 May 2017 09:01:27 -0400 |
parents | 154147805a33 |
children | 07107a686ce9 |
rev | line source |
---|---|
1
86770eea5b09
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff
changeset
|
1 import argparse |
86770eea5b09
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff
changeset
|
2 import os |
86770eea5b09
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff
changeset
|
3 import re |
86770eea5b09
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff
changeset
|
4 from collections import OrderedDict |
86770eea5b09
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff
changeset
|
5 |
86770eea5b09
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff
changeset
|
6 |
86770eea5b09
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff
changeset
|
def get_filename_index_with_identifier(realnames, pool_id):
    """Return the positions of the file names that belong to a pool.

    ``pool_id`` is used as a regular expression and searched for anywhere
    in each entry of ``realnames``. The indices of the matching names are
    returned as a list, in the order the names were given. An empty list is
    returned when nothing matches.
    """
    # Compile the pattern once instead of passing the raw string to
    # re.search for every single file name.
    pattern = re.compile(pool_id)
    return [index for index, fn in enumerate(realnames)
            if pattern.search(fn) is not None]
86770eea5b09
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff
changeset
|
13 |
86770eea5b09
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff
changeset
|
14 |
86770eea5b09
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff
changeset
|
def get_batches_of_galaxyfiles(realnames, batchsize, pool_ids):
    """Yield batches of input files, never mixing pools within one batch.

    realnames -- names of the input files
    batchsize -- maximum number of files per batch, as an int or a numeric
        string; a false value (or a value below 1) keeps all files of a
        pool together in a single batch
    pool_ids -- identifiers used to assign files to pools by matching them
        against the file names; a false value puts every file in one pool
        called 'pool0'

    Each yielded item is a tuple ``(pool_id, batch, in_pool_indices)``:
    ``batch`` holds indices into ``realnames`` and ``in_pool_indices`` holds
    the position of each of those files inside its pool. A pool's last batch
    may be shorter than ``batchsize``.
    """
    # Parse the batch size once up front rather than for every file; 0 can
    # never equal the length of a non-empty batch, so it means "no limit".
    limit = int(batchsize) if batchsize else 0
    if pool_ids:
        filegroups = OrderedDict(
            (p_id, get_filename_index_with_identifier(realnames, p_id))
            for p_id in pool_ids)
    else:
        filegroups = {'pool0': range(len(realnames))}
    for pool_id, grouped_indices in filegroups.items():
        batch, in_pool_indices = [], []
        for in_pool_index, total_index in enumerate(grouped_indices):
            batch.append(total_index)
            in_pool_indices.append(in_pool_index)
            if len(batch) == limit:
                yield pool_id, batch, in_pool_indices
                batch, in_pool_indices = [], []
        # Flush the files left over in this pool before starting the next
        # one, so that a batch never spans two pools.
        if batch:
            yield pool_id, batch, in_pool_indices
1
86770eea5b09
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff
changeset
|
36 |
86770eea5b09
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff
changeset
|
37 |
86770eea5b09
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff
changeset
|
def main():
    """Symlink the Galaxy input files under pool/batch-encoding names.

    The files given with --galaxy-files are grouped into pools and batches
    according to their names in --real-names (same order). Each file is then
    symlinked in the current directory as
    ``<pool>_batch<batch no>___inputfn<index in pool>_<real name>.data``,
    with both numbers zero-padded.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchsize', dest='batchsize', default=False)
    # Both file lists are indexed below, so they must be present.
    parser.add_argument('--real-names', dest='realnames', nargs='+',
                        required=True)
    parser.add_argument('--galaxy-files', dest='galaxyfiles', nargs='+',
                        required=True)
    parser.add_argument('--pool-ids', dest='poolids', nargs='+', default=False)
    args = parser.parse_args()
    # Names and files are paired by position; refuse a mismatch here instead
    # of failing on an index error after some symlinks were already created.
    if len(args.realnames) != len(args.galaxyfiles):
        parser.error('--real-names and --galaxy-files must have the same '
                     'number of entries')
    batches = list(get_batches_of_galaxyfiles(
        args.realnames, args.batchsize, args.poolids))
    # Zero-padding widths for the batch number and the in-pool file index.
    batchdigits = len(str(len(batches)))
    if args.poolids:
        poolsizes = {pid: 0 for pid in args.poolids}
        for pool_id, batch, _ in batches:
            poolsizes[pool_id] += len(batch)
        pooldigits = {pid: len(str(size)) for pid, size in poolsizes.items()}
    else:
        pooldigits = {'pool0': len(str(len(args.galaxyfiles)))}
    for batchcount, (pool_id, batch, in_pool_indices) in enumerate(batches):
        for fnindex, in_pool_index in zip(batch, in_pool_indices):
            dsetname = '{pid}_batch{bi:0{bd}d}___inputfn{fi:0{pd}d}_{real}.data'.format(
                pid=pool_id, bi=batchcount, bd=batchdigits, fi=in_pool_index,
                pd=pooldigits[pool_id], real=args.realnames[fnindex])
            print('producing', dsetname)
            os.symlink(args.galaxyfiles[fnindex], dsetname)


if __name__ == '__main__':
    main()