#!/usr/bin/env python
# Provenance: header of the changeset export this script was recovered from (kept verbatim).
# HG changeset patch # User bgruening # Date 1511730831 18000 # Node ID 64469e7ecf9fbac42f07a2c220945550a4588d2d planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/join_files_on_column_fuzzy commit fd2bc86108994c9eda731b305ca6a8c71554cfaa diff -r 000000000000 -r 64469e7ecf9f join_files_on_column_fuzzy.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/join_files_on_column_fuzzy.py Sun Nov 26 16:13:51 2017 -0500 @@ -0,0 +1,119 @@
"""Join two delimited files on a numeric column, tolerating a small difference.

Every record of file 1 is paired with the records of file 2 whose key lies
within ``--distance`` of its own key, measured either as an absolute
difference or in parts per million (ppm) of the file 1 key.

The join is a single forward pass over both files, so it only finds all
matches when both files are sorted in ascending order of their key column.
"""

import os
import argparse
import collections
import sys


def _read_second_file(handle, sep, column, header=False):
    """Yield the records of file 2 from an open text handle.

    When *header* is true the first line is yielded once as a stripped string.
    Every other non-blank line is yielded as ``(columns, key)`` where *columns*
    is the line split on *sep* and *key* is ``float(columns[column - 1])``.
    """
    for line in handle:
        line = line.strip()
        if header:
            header = False
            yield line
            continue
        if not line:
            continue
        columns = line.split(sep)
        yield columns, float(columns[column - 1])


def _classify(value1, value2, distance, unit):
    """Relate a file 2 key to the tolerance window around a file 1 key.

    Returns ``(relation, dist)``. *relation* is ``'match'`` (inside the
    window), ``'ahead'`` (above it, may still match a later file 1 record) or
    ``'behind'`` (below it, can be dropped). *dist* is the distance in *unit*
    for a match and ``None`` otherwise.

    Raises:
        ValueError: if *unit* is neither ``'absolute'`` nor ``'ppm'``.
    """
    if unit == 'absolute':
        upper_bound = value1 + distance
        # Compare against the bounds rather than abs(diff) <= distance: the two
        # forms disagree on borderline floats (e.g. 2 vs 2.2 at distance 0.2).
        if value2 <= upper_bound and value2 >= (value1 - distance):
            return 'match', abs(value1 - value2)
        if value2 > upper_bound:
            return 'ahead', None
        return 'behind', None

    if unit == 'ppm':
        if value1 == 0:
            # A relative distance to zero is undefined; only an exact zero
            # matches instead of failing with ZeroDivisionError.
            ppm_dist = 0.0 if value2 == 0 else float('inf')
        else:
            ppm_dist = abs((value1 - value2) / value1 * 1000000)
        if ppm_dist <= distance:
            return 'match', ppm_dist
        # Values below the window must be dropped, otherwise they pile up in
        # the cache and stop the reader from ever reaching later matches.
        if value2 > value1:
            return 'ahead', None
        return 'behind', None

    raise ValueError('Unknown unit: %r' % (unit,))


def main(args):
    """Write the fuzzy join of ``args.f1`` and ``args.f2`` to ``args.outfile``.

    Args:
        args: namespace with the attributes ``f1``, ``f2`` (input paths),
            ``c1``, ``c2`` (1-based key columns), ``outfile``, ``header``,
            ``closest``, ``add_distance``, ``sep``, ``distance`` and ``unit``
            (``'absolute'`` or ``'ppm'``).

    Output lines are always joined with tabs. With ``header`` the first line
    of each file is treated as a header and the two are written joined,
    followed by the unit name when ``add_distance`` is set. With ``closest``
    only the nearest match per file 1 record is written.

    Raises:
        ValueError: if a key column is not numeric or ``unit`` is unknown.
        IndexError: if a line has fewer columns than the key column index.
    """
    with open(args.outfile, 'w+') as out, \
            open(args.f1) as handle1, \
            open(args.f2) as handle2:
        records = _read_second_file(handle2, args.sep, args.c2,
                                    header=args.header)
        # File 2 records read so far that may still match a file 1 record.
        pending = collections.deque()

        def fetch_next():
            """Move the next file 2 record, if any, into the pending queue."""
            record = next(records, None)
            if record is not None:
                pending.append(record)

        expect_header = args.header
        for line in handle1:
            line = line.strip()
            if expect_header:
                expect_header = False
                # An empty file 2 yields an empty header instead of crashing.
                second_header = next(records, '')
                if args.add_distance:
                    out.write('%s\t%s\t%s\n' % (line, second_header, args.unit))
                else:
                    out.write('%s\t%s\n' % (line, second_header))
                continue
            if not line:
                continue

            columns = line.split(args.sep)
            value1 = float(columns[args.c1 - 1])
            carried = collections.deque()
            matches = []
            fetch_next()
            while pending:
                record = pending.popleft()
                columns2, value2 = record
                relation, dist = _classify(value1, value2,
                                           args.distance, args.unit)
                if relation == 'match':
                    joined = '\t'.join(columns + columns2)
                    if args.add_distance:
                        joined += '\t' + str(dist)
                    matches.append((dist, joined + '\n'))
                    # Windows of neighbouring file 1 records can overlap, so a
                    # matched record stays available for the next one.
                    carried.append(record)
                    fetch_next()
                elif relation == 'ahead':
                    carried.append(record)
                else:
                    fetch_next()

            if args.closest and matches:
                # min() returns the first of several equally close matches.
                out.write(min(matches, key=lambda match: match[0])[1])
            else:
                out.writelines(text for _dist, text in matches)
            pending = carried


if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Merge two files on a common column the fuzzy way.')
    parser.add_argument('--f1', required=True)
    parser.add_argument('--f2', required=True)
    parser.add_argument('--c1', type=int, required=True, help="Column in file 1 to be merged on.")
    parser.add_argument('--c2', type=int, required=True, help="Column in file 2 to be merged on.")
    parser.add_argument('--outfile', required=True)
    parser.add_argument('--header', action='store_true', help="The files have a header line at the beginning.")
    parser.add_argument('--closest', action='store_true', help="Only report the closest match.")
    parser.add_argument('--add_distance', action='store_true', help="Add addional column with the distance between the two values.")
    parser.add_argument('--sep', type=str, default="\t", help="Files are separated by this separator.")
    parser.add_argument('--distance', type=float, default=0.2, help="Maximal allowed distance.")
    parser.add_argument('--unit', choices=['ppm', 'absolute'], default='absolute')
    args = parser.parse_args()

    main(args)

# Remainder of the changeset export that shared these physical lines (non-Python patch residue, kept verbatim):
# diff -r 000000000000 -r 64469e7ecf9f join_files_on_column_fuzzy.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/join_files_on_column_fuzzy.xml Sun Nov 26 16:13:51 2017 -0500 @@ -0,0 +1,141 @@ + + + on column allowing a small difference + + + python + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 000000000000 -r 64469e7ecf9f test-data/file1.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/file1.tab Sun Nov 26 16:13:51 2017 -0500 @@ -0,0 +1,10 @@ +1 one +2 two +3 three +4 four +5 five +6 six +7 seven +8 eight +9 nine +10 ten diff -r 000000000000 -r 64469e7ecf9f test-data/file1_header.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/file1_header.tab Sun Nov 26 16:13:51 2017 -0500 @@ -0,0 +1,11 @@ +#number1 desc1 +1 one +2 two +3 three +4 four +5 five +6 six +7 seven +8 eight +9 nine +10 ten diff -r 000000000000 -r 64469e7ecf9f test-data/file1_ppm.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/file1_ppm.tab Sun Nov 26 16:13:51 2017 -0500 @@ -0,0 +1,3 @@ +1221.12 first entry +1973.54 second entry +2233.44 third entry diff -r 000000000000 -r 64469e7ecf9f test-data/file2.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/file2.tab Sun Nov 26 16:13:51 2017 -0500 @@ -0,0 +1,43 @@ +1.1 should be true +1.1 should be true +1.1 should be true +1.2 should be false +1.3 should be false +1.4 should be false +1.5 should be false +1.6 should be false +1.7 should be false +1.8 should be false +1.9 should be false +2 should be true +2.1 should be false +2.2 should be false +2.3 should be false +2.4 should be false +2.5 should be false +2.6 should be false +2.7 should be false +2.8 should be false +2.9 should be false +3 should be true +3.1 should be false +3.2 should be
false +3.3 should be false +3.4 should be false +3.5 should be false +3.6 should be false +3.7 should be false +3.8 should be false +3.9 should be false +4 should be true +4.1 should be false +4.2 should be false +4.3 should be false +4.4 should be false +4.5 should be false +4.6 should be false +4.7 should be false +4.8 should be false +4.9 should be false +5.1 should be true +10.1 should be true diff -r 000000000000 -r 64469e7ecf9f test-data/file2_header.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/file2_header.tab Sun Nov 26 16:13:51 2017 -0500 @@ -0,0 +1,44 @@ +#number2 desc2 +1.1 should be true +1.1 should be true +1.1 should be true +1.2 should be false +1.3 should be false +1.4 should be false +1.5 should be false +1.6 should be false +1.7 should be false +1.8 should be false +1.9 should be false +2 should be true +2.1 should be false +2.2 should be false +2.3 should be false +2.4 should be false +2.5 should be false +2.6 should be false +2.7 should be false +2.8 should be false +2.9 should be false +3 should be true +3.1 should be false +3.2 should be false +3.3 should be false +3.4 should be false +3.5 should be false +3.6 should be false +3.7 should be false +3.8 should be false +3.9 should be false +4 should be true +4.1 should be false +4.2 should be false +4.3 should be false +4.4 should be false +4.5 should be false +4.6 should be false +4.7 should be false +4.8 should be false +4.9 should be false +5.1 should be true +10.1 should be true diff -r 000000000000 -r 64469e7ecf9f test-data/file2_ppm.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/file2_ppm.tab Sun Nov 26 16:13:51 2017 -0500 @@ -0,0 +1,11 @@ +1221.13 match1 +1221.11 match2 +1221.15 match3 +1221 match4 +1973.5 match5 +1973.52 match6 +1973.57 match7 +1973.48 match8 +2233.4 match9 +2233.3 match10 +2233.5 match11 diff -r 000000000000 -r 64469e7ecf9f test-data/header_closest_result3.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ 
b/test-data/header_closest_result3.tab Sun Nov 26 16:13:51 2017 -0500 @@ -0,0 +1,7 @@ +#number1 desc1 #number2 desc2 +1 one 1.1 should be true +2 two 2 should be true +3 three 3 should be true +4 four 4 should be true +5 five 4.9 should be false +10 ten 10.1 should be true diff -r 000000000000 -r 64469e7ecf9f test-data/header_closest_result5.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/header_closest_result5.tab Sun Nov 26 16:13:51 2017 -0500 @@ -0,0 +1,7 @@ +#number1 desc1 #number2 desc2 absolute +1 one 1.1 should be true 0.10000000000000009 +2 two 2 should be true 0.0 +3 three 3 should be true 0.0 +4 four 4 should be true 0.0 +5 five 4.9 should be false 0.09999999999999964 +10 ten 10.1 should be true 0.09999999999999964 diff -r 000000000000 -r 64469e7ecf9f test-data/header_result2.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/header_result2.tab Sun Nov 26 16:13:51 2017 -0500 @@ -0,0 +1,24 @@ +#number1 desc1 #number2 desc2 +1 one 1.1 should be true +1 one 1.1 should be true +1 one 1.1 should be true +1 one 1.2 should be false +2 two 1.8 should be false +2 two 1.9 should be false +2 two 2 should be true +2 two 2.1 should be false +2 two 2.2 should be false +3 three 2.8 should be false +3 three 2.9 should be false +3 three 3 should be true +3 three 3.1 should be false +3 three 3.2 should be false +4 four 3.8 should be false +4 four 3.9 should be false +4 four 4 should be true +4 four 4.1 should be false +4 four 4.2 should be false +5 five 4.8 should be false +5 five 4.9 should be false +5 five 5.1 should be true +10 ten 10.1 should be true diff -r 000000000000 -r 64469e7ecf9f test-data/no_header_ppm_result4.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/no_header_ppm_result4.tab Sun Nov 26 16:13:51 2017 -0500 @@ -0,0 +1,11 @@ +1221.12 first entry 1221.13 match1 +1221.12 first entry 1221.11 match2 +1221.12 first entry 1221.15 match3 +1221.12 first entry 1221 match4 +1973.54 second entry 1973.5 match5 +1973.54 
second entry 1973.52 match6 +1973.54 second entry 1973.57 match7 +1973.54 second entry 1973.48 match8 +2233.44 third entry 2233.4 match9 +2233.44 third entry 2233.3 match10 +2233.44 third entry 2233.5 match11 diff -r 000000000000 -r 64469e7ecf9f test-data/no_header_ppm_result6.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/no_header_ppm_result6.tab Sun Nov 26 16:13:51 2017 -0500 @@ -0,0 +1,11 @@ +1221.12 first entry 1221.13 match1 8.189203354476447 +1221.12 first entry 1221.11 match2 8.189203354290248 +1221.12 first entry 1221.15 match3 24.56761006305694 +1221.12 first entry 1221 match4 98.27044025148295 +1973.54 second entry 1973.5 match5 20.268147592632335 +1973.54 second entry 1973.52 match6 10.134073796316168 +1973.54 second entry 1973.57 match7 15.201110694474252 +1973.54 second entry 1973.48 match8 30.402221388948504 +2233.44 third entry 2233.4 match9 17.909592377661195 +2233.44 third entry 2233.3 match10 62.68357332181419 +2233.44 third entry 2233.5 match11 26.864388566491794 diff -r 000000000000 -r 64469e7ecf9f test-data/no_header_result1.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/no_header_result1.tab Sun Nov 26 16:13:51 2017 -0500 @@ -0,0 +1,15 @@ +1 one 1.1 should be true +1 one 1.1 should be true +1 one 1.1 should be true +2 two 1.9 should be false +2 two 2 should be true +2 two 2.1 should be false +3 three 2.9 should be false +3 three 3 should be true +3 three 3.1 should be false +4 four 3.9 should be false +4 four 4 should be true +4 four 4.1 should be false +5 five 4.9 should be false +5 five 5.1 should be true +10 ten 10.1 should be true