annotate nested_collection.py @ 6:07107a686ce9 draft default tip

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit de8cdf895c3c6113f301a119788701b2465a1b1b"
author galaxyp
date Thu, 13 Aug 2020 03:53:33 -0400
parents 154147805a33
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
86770eea5b09 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff changeset
1 import argparse
86770eea5b09 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff changeset
2 import os
86770eea5b09 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff changeset
3 import re
86770eea5b09 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff changeset
4 from collections import OrderedDict
86770eea5b09 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff changeset
5
86770eea5b09 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff changeset
6
86770eea5b09 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff changeset
def get_filename_index_with_identifier(realnames, pool_id):
    """Return the positions of the file names that match a pool identifier.

    Args:
        realnames: Iterable of file names to inspect.
        pool_id: Pool identifier. It is used as a regular expression and is
            searched for anywhere in each name (``re.search`` semantics).

    Returns:
        List of zero-based indices into ``realnames`` whose name matches,
        in input order. Empty when nothing matches.
    """
    # Compile once up front rather than passing the raw pattern per file.
    pattern = re.compile(pool_id)
    return [index for index, fn in enumerate(realnames)
            if pattern.search(fn) is not None]
86770eea5b09 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff changeset
13
86770eea5b09 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff changeset
14
86770eea5b09 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff changeset
def get_batches_of_galaxyfiles(realnames, batchsize, pool_ids):
    """Yield batches of input file indices, grouped by pool.

    Args:
        realnames: Sequence of input file names.
        batchsize: Maximum number of files per batch (anything ``int()``
            accepts). A falsy value disables splitting, so every pool is
            emitted as one batch.
        pool_ids: Iterable of pool identifiers, or a falsy value. When
            given, files are assigned to pools with
            ``get_filename_index_with_identifier``; names that are not
            returned for any pool are not batched at all. When falsy, all
            files form a single pool called ``'pool0'``.

    Yields:
        ``(pool_id, batch, in_pool_indices)`` tuples. ``batch`` holds
        indices into ``realnames`` and ``in_pool_indices`` holds, for the
        same files, their running position inside the pool. A batch never
        spans two pools; the last batch of a pool may be shorter than
        ``batchsize``.
    """
    if pool_ids:
        filegroups = OrderedDict(
            (p_id, get_filename_index_with_identifier(realnames, p_id))
            for p_id in pool_ids)
    else:
        # Name the implicit single pool directly instead of using a
        # placeholder key that has to be renamed while iterating.
        filegroups = {'pool0': range(len(realnames))}
    # Convert once; None means "never split a pool".
    max_files = int(batchsize) if batchsize else None
    for pool_id, grouped_indices in filegroups.items():
        batch, in_pool_indices = [], []
        for in_pool_index, total_index in enumerate(grouped_indices):
            batch.append(total_index)
            in_pool_indices.append(in_pool_index)
            if max_files is not None and len(batch) == max_files:
                yield pool_id, batch, in_pool_indices
                batch, in_pool_indices = [], []
        # Flush the incomplete remainder before moving on to the next pool.
        if batch:
            yield pool_id, batch, in_pool_indices
1
86770eea5b09 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff changeset
36
86770eea5b09 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff changeset
37
86770eea5b09 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff changeset
def main():
    """Symlink the input files under names that encode pool and batch.

    Command-line arguments:
        --batchsize: optional maximum number of files per batch; when it
            is not given, each pool ends up in a single batch.
        --real-names: names of the input files (required).
        --galaxy-files: paths of the input files, looked up with the same
            index as ``--real-names`` (required).
        --pool-ids: optional pool identifiers used to group the files.

    For every batched file a symlink named
    ``{pool}_batch{batch number}___inputfn{index in pool}_{real name}.data``
    is created, pointing at the corresponding ``--galaxy-files`` entry, and
    the link name is printed. Both numbers are zero padded.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchsize', dest='batchsize', default=False)
    # Both file lists are indexed unconditionally below, so let argparse
    # report a missing one instead of failing later with a TypeError.
    parser.add_argument('--real-names', dest='realnames', nargs='+',
                        required=True)
    parser.add_argument('--galaxy-files', dest='galaxyfiles', nargs='+',
                        required=True)
    parser.add_argument('--pool-ids', dest='poolids', nargs='+', default=False)
    args = parser.parse_args()

    # Materialize the generator: the batches are counted first and then
    # iterated again to create the links.
    batches = list(get_batches_of_galaxyfiles(
        args.realnames, args.batchsize, args.poolids))
    batchdigits = len(str(len(batches)))

    # Zero-padding width of the in-pool file index, per pool.
    if args.poolids:
        files_per_pool = {pid: 0 for pid in args.poolids}
        for pool_id, batch, _ in batches:
            files_per_pool[pool_id] += len(batch)
        pooldigits = {pid: len(str(count))
                      for pid, count in files_per_pool.items()}
    else:
        pooldigits = {'pool0': len(str(len(args.galaxyfiles)))}

    for batchcount, (pool_id, batch, in_pool_indices) in enumerate(batches):
        for fnindex, in_pool_index in zip(batch, in_pool_indices):
            dsetname = '{pid}_batch{bi:0{bd}d}___inputfn{fi:0{pd}d}_{real}.data'.format(
                pid=pool_id, bi=batchcount, bd=batchdigits, fi=in_pool_index,
                pd=pooldigits[pool_id], real=args.realnames[fnindex])
            print('producing', dsetname)
            os.symlink(args.galaxyfiles[fnindex], dsetname)
1
86770eea5b09 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff changeset
59
6
07107a686ce9 "planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit de8cdf895c3c6113f301a119788701b2465a1b1b"
galaxyp
parents: 4
diff changeset
60
1
86770eea5b09 planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tools/percolator commit 0a5f9eb82877545be1c924357e585b17e01cfd1c
galaxyp
parents:
diff changeset
# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()