Mercurial > repos > bgruening > split_file_to_collection
annotate split_file_to_collection.py @ 8:6cbe2f30c2d7 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
author | bgruening |
---|---|
date | Sun, 12 Jul 2020 10:27:06 -0400 |
parents | 0046692724f9 |
children | baabc30154cd |
rev | line source |
---|---|
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
1 #!/usr/bin/env python |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
2 |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
3 import argparse |
5
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
4 import math |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
5 import os |
8
6cbe2f30c2d7
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents:
7
diff
changeset
|
6 import random |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
7 import re |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
8 |
5
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
# Splitting configuration for the specifically supported file types.
# Every value is a tuple of
# - the regular expression matching the record separator
#   ('' when records are delimited by a number of lines instead of a regex)
# - the number of lines making up one record
#   (0 when records are delimited by the regex instead of a line count)
# - a boolean telling whether the record separator is at the end of the record
#
# To support a new file type: add an entry to this dict, update the parser,
# and add a new type option in the Galaxy wrapper.
FILETYPES = {
    'fasta': (r'^>', 0, False),
    'fastq': (r'', 4, False),
    'tabular': (r'', 1, False),
    'txt': (r'', 1, False),
    'mgf': (r'^BEGIN IONS', 0, False),
    'sdf': (r'\$\$\$\$', 0, True),
}
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
23 |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
24 |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
def main():
    """
    Command line entry point.

    Parses the command line, validates the arguments and dispatches to
    split_by_column (tabular input split by column) or split_by_record
    (everything else).

    Raises:
        FileNotFoundError: if the input file does not exist or the output
            directory is not a directory.
        ValueError: if the number of header lines is negative, or if
            --generic_re is unset for generic input.
    """
    ps = parser_cli()
    args = vars(ps.parse_args())

    # get args and validate
    in_file = args["in"]
    if not os.path.isfile(in_file):
        raise FileNotFoundError('Input file does not exist')

    out_dir = args["out_dir"]
    if not os.path.isdir(out_dir):
        raise FileNotFoundError('out_dir is not a directory')

    top = args["top"]
    if top < 0:
        raise ValueError("Number of header lines cannot be negative")

    ftype = args["ftype"]

    # Explicit raise instead of `assert`: assertions are removed when Python
    # runs with -O, which would silently disable this validation.
    if ftype == "generic" and args["generic_re"] is None:
        raise ValueError("--generic_re needs to be given for generic input")

    if ftype == "tabular" and args["by"] == "col":
        # undo the character escaping before the patterns are used
        args["match"] = replace_mapped_chars(args["match"])
        args["sub"] = replace_mapped_chars(args["sub"])
        split_by_column(args, in_file, out_dir, top)
    else:
        args["generic_re"] = replace_mapped_chars(args["generic_re"])
        split_by_record(args, in_file, out_dir, top, ftype)
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
53 |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
54 |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
def parser_cli():
    """
    Build the command line parser of the tool.

    Returns:
        argparse.ArgumentParser: the configured parser; the caller is
        expected to call parse_args() on it.
    """
    parser = argparse.ArgumentParser(description="split a file into multiple files. "
                                                 "Can split on the column of a tabular file, "
                                                 "with custom and useful names based on column value.")
    parser.add_argument('--in', '-i', required=True, help="The input file")
    # Not marked as required: a required option never falls back to its
    # default, which made the working-directory default unreachable.
    parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory")
    parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files")
    parser.add_argument('--file_ext', '-e', help="If not splitting by column,"
                                                 " the extension of the new files (without a period)")
    parser.add_argument('--ftype', '-f', help="The type of the file to split", required=True,
                        choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])
    parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)",
                        default="row", choices=["col", "row"])
    parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.")
    parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true')
    parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. "
                                             "If not provided and args[\"rand\"]==True, then date is used", type=int)

    # --numnew and --chunksize exclude each other: argparse rejects a command
    # line that passes both of them.
    size_group = parser.add_mutually_exclusive_group()
    size_group.add_argument('--numnew', '-n', type=int, default=1,
                            help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")
    size_group.add_argument('--chunksize', '-k', type=int, default=0,
                            help="Number of records by file. Not valid for splitting on a column")
    parser.add_argument('--batch', action='store_true',
                        help="Distribute files to collection while maintaining order. Ignored if splitting on column.")

    generic = parser.add_argument_group('Arguments controling generic splitting')
    # a generic record is delimited either by a regex or by a line count
    record_group = generic.add_mutually_exclusive_group()
    record_group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required=False)
    record_group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required=False)
    generic.add_argument('--split_after', '-p', action='store_true',
                         help="Split between records after separator (default is before). "
                              "Only for generic splitting by regex - specific ftypes are always split in the default way")

    bycol = parser.add_argument_group('If splitting on a column')
    bycol.add_argument('--match', '-m', default="(.*)", help="The regular expression to match id column entries")
    bycol.add_argument('--sub', '-s', default=r'\1',
                       help="The regular expression to substitute in for the matched pattern.")
    # the default is given as an int so that it matches the declared type
    bycol.add_argument('--id_column', '-c', default=1,
                       help="Column that is used to name output files. Indexed starting from 1.", type=int)
    return parser
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
93 |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
94 |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
def replace_mapped_chars(pattern):
    """
    restores the special characters that arrive escaped when coming from galaxy

    Every '__sq__' becomes a single quote and every '__backslash__'
    becomes a backslash; the resulting string is returned.
    """
    # (escaped token, literal character) pairs; the order is significant
    # because a token may overlap with the remainder of another one
    substitutions = (('__sq__', '\''), ('__backslash__', '\\'))
    restored = pattern
    for token, literal in substitutions:
        restored = restored.replace(token, literal)
    return restored
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
103 |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
104 |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
105 def split_by_record(args, in_file, out_dir, top, ftype): |
5
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
106 # get configuration (record separator, start at end) for given filetype |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
107 sep, num, sep_at_end = FILETYPES.get(ftype, (args["generic_re"], args["generic_num"], args["split_after"])) |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
108 sep = re.compile(sep) |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
109 |
4
0850f2dfba13
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
bgruening
parents:
3
diff
changeset
|
110 chunksize = args["chunksize"] |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
111 numnew = args["numnew"] |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
112 |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
113 # random division |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
114 rand = args["rand"] |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
115 seed = args["seed"] |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
116 if seed: |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
117 random.seed(seed) |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
118 else: |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
119 random.seed() |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
120 |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
121 # batched division (maintains order) |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
122 batch = args["batch"] |
8
6cbe2f30c2d7
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents:
7
diff
changeset
|
123 |
5
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
124 # determine |
8
6cbe2f30c2d7
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents:
7
diff
changeset
|
125 # - the number of records that should be stored per file |
5
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
126 # (done always, even if used only for batch mode) |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
127 # - if the separator is at the start / end of the record |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
128 n_per_file = math.inf |
8
6cbe2f30c2d7
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents:
7
diff
changeset
|
129 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
130 with open(in_file) as f: |
5
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
131 # read header lines |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
132 for i in range(top): |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
133 f.readline() |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
134 n_records = 0 |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
135 for line in f: |
5
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
136 if (num == 0 and re.match(sep, line) is not None) or (num > 0 and n_records % num == 0): |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
137 n_records += 1 |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
138 last_line_matched = True |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
139 else: |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
140 last_line_matched = False |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
141 if sep_at_end and not last_line_matched: |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
142 n_records += 1 |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
143 |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
144 # if there are fewer records than desired files |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
145 numnew = min(numnew, n_records) |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
146 # approx. number of records per file |
8
6cbe2f30c2d7
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents:
7
diff
changeset
|
147 if chunksize == 0: # i.e. no chunking |
4
0850f2dfba13
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
bgruening
parents:
3
diff
changeset
|
148 n_per_file = n_records // numnew |
0850f2dfba13
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
bgruening
parents:
3
diff
changeset
|
149 else: |
0850f2dfba13
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
bgruening
parents:
3
diff
changeset
|
150 numnew = n_records // chunksize |
0850f2dfba13
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
bgruening
parents:
3
diff
changeset
|
151 n_per_file = chunksize |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
152 |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
153 # make new files |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
154 # strip extension of old file and add number |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
155 custom_new_file_name = args["file_names"] |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
156 custom_new_file_ext = "." + args["file_ext"] |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
157 if custom_new_file_name is None: |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
158 new_file_base = os.path.splitext(os.path.basename(in_file)) |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
159 else: |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
160 new_file_base = [custom_new_file_name, custom_new_file_ext] |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
161 |
8
6cbe2f30c2d7
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents:
7
diff
changeset
|
162 newfile_names = [os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) for count in range(0, numnew)] |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
163 # bunch o' counters |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
164 # index to list of new files |
5
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
165 if rand: |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
166 new_file_counter = int(math.floor(random.random() * numnew)) |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
167 else: |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
168 new_file_counter = 0 |
8
6cbe2f30c2d7
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents:
7
diff
changeset
|
169 new_file = open(newfile_names[new_file_counter], "a") |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
170 # to contain header specified by top |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
171 header = "" |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
172 # keep track of the files that have not yet had the header written |
5
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
173 fresh_files = set(range(numnew)) |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
174 |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
175 # keep track in loop of number of records in each file |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
176 # only used in batch |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
177 records_in_file = 0 |
3
2ddc36385d7a
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 8d069684e155d2f5b6fae06d14d98ce41321da53"
bgruening
parents:
2
diff
changeset
|
178 |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
179 # open file |
5
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
180 with open(in_file, "r") as f: |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
181 # read header |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
182 for i in range(top): |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
183 header += f.readline() |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
184 |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
185 record = "" |
5
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
186 for line_no, line in enumerate(f): |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
187 # check if beginning of line is record sep |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
188 # if beginning of line is record sep, either start record or finish one |
5
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
189 if (num == 0 and re.match(sep, line) is not None) or (num > 0 and line_no % num == 0): |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
190 # this only happens first time through |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
191 if record == "": |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
192 record += line |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
193 else: |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
194 # if the current file is in fresh_files, write header and drop it from fresh_files |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
195 if new_file_counter in fresh_files: |
8
6cbe2f30c2d7
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents:
7
diff
changeset
|
196 new_file.write(header) |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
197 fresh_files.remove(new_file_counter) |
8
6cbe2f30c2d7
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents:
7
diff
changeset
|
198 |
5
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
199 if sep_at_end: |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
200 record += line |
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
201 # write record to file |
8
6cbe2f30c2d7
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents:
7
diff
changeset
|
202 new_file.write(record) |
5
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
203 if not sep_at_end: |
4
0850f2dfba13
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
bgruening
parents:
3
diff
changeset
|
204 record = line |
5
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
205 else: |
4
0850f2dfba13
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
bgruening
parents:
3
diff
changeset
|
206 record = "" |
5
e77b954f0da5
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
bgruening
parents:
4
diff
changeset
|
207 |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
208 # change destination file |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
209 if rand: |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
210 new_file_counter = int(math.floor(random.random() * numnew)) |
8
6cbe2f30c2d7
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents:
7
diff
changeset
|
211 new_file.close() |
6cbe2f30c2d7
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents:
7
diff
changeset
|
212 new_file = open(newfile_names[new_file_counter], "a") |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
213 elif batch: |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
214 # number of records read per file |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
215 records_in_file += 1 |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
216 # have we reached the max for each file? |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
217 # if so, switch file |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
218 if records_in_file >= n_per_file: |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
219 new_file_counter = (new_file_counter + 1) % numnew |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
220 records_in_file = 0 # reset to 0 |
8
6cbe2f30c2d7
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents:
7
diff
changeset
|
221 new_file.close() |
6cbe2f30c2d7
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents:
7
diff
changeset
|
222 new_file = open(newfile_names[new_file_counter], "a") |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
223 else: |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
224 new_file_counter = (new_file_counter + 1) % numnew |
8
6cbe2f30c2d7
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents:
7
diff
changeset
|
225 new_file.close() |
6cbe2f30c2d7
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents:
7
diff
changeset
|
226 new_file = open(newfile_names[new_file_counter], "a") |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
227 # if beginning of line is not record sep, we must be inside a record |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
228 # so just append |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
229 else: |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
230 record += line |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
231 # after loop, write final record to file |
8
6cbe2f30c2d7
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents:
7
diff
changeset
|
232 new_file.write(record) |
6cbe2f30c2d7
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
bgruening
parents:
7
diff
changeset
|
233 new_file.close() |
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
234 |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
235 |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
def split_by_column(args, in_file, out_dir, top):
    """Split a tab-separated file into one output file per id-column value.

    The first ``top`` lines of ``in_file`` are treated as a header and are
    written at the start of every output file begun during this call. For
    each remaining line, the value found in the id column is rewritten with
    the ``match``/``sub`` regular expression pair and the result is used as
    the name of the output file (inside ``out_dir``) the line is appended to.

    Args:
        args: Mapping providing ``"id_column"`` (1-based column number),
            ``"match"`` (regular expression) and ``"sub"`` (replacement).
        in_file: Path of the tab-separated file to split.
        out_dir: Directory in which the output files are written.
        top: Number of leading lines to treat as header.

    Raises:
        ValueError: If ``id_column`` is smaller than 1.
        re.error: If ``args["match"]`` is not a valid regular expression.
        IndexError: If a data line has fewer columns than ``id_column``.
    """
    # shift to 0-based indexing
    id_col = int(args["id_column"]) - 1
    if id_col < 0:
        # A negative index would silently select a column counted from the
        # end of the row instead of failing.
        raise ValueError(
            "id_column must be a 1-based column number, got {!r}".format(
                args["id_column"]
            )
        )

    try:
        match = re.compile(args["match"])
    except re.error:
        print("ERROR: Match (-m) supplied is not valid regex.")
        raise

    sub = args["sub"]

    # names of the output files that already received the header
    files = set()

    header_lines = []
    with open(in_file) as file:
        for n_read, line in enumerate(file, start=1):
            # if still in top, save to header
            if n_read <= top:
                header_lines.append(line)
                continue

            # split into columns, on tab
            fields = line.strip('\n').split('\t')

            # get id column value
            if id_col >= len(fields):
                raise IndexError(
                    "Line {} of {} has {} column(s), but id_column is {}".format(
                        n_read, in_file, len(fields), id_col + 1
                    )
                )
            id_col_val = fields[id_col]

            # use regex to get new file name
            # NOTE(review): the name is derived from file content and is not
            # sanitised; values containing path separators are joined as-is.
            out_file_name = match.sub(sub, id_col_val)
            out_file_path = os.path.join(out_dir, out_file_name)

            # Re-open in append mode for every line rather than keeping one
            # handle per output file, so the number of simultaneously open
            # files stays constant however many distinct ids there are.
            with open(out_file_path, "a") as current_new_file:
                if out_file_name not in files:
                    current_new_file.write("".join(header_lines))
                    files.add(out_file_name)
                current_new_file.write(line)
0
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
278 |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
279 |
de3c2c88e710
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
bgruening
parents:
diff
changeset
|
# Invoke the command-line entry point only when run as a script, not on import.
if __name__ == "__main__":
    main()