Mercurial > repos > mvdbeek > dedup_hash
annotate test/test_dedup_hash.py @ 0:f33e9e6a6c88 draft default tip
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
author | mvdbeek |
---|---|
date | Wed, 23 Nov 2016 07:49:05 -0500 |
parents | |
children |
rev | line source |
---|---|
0
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
1 import hashlib |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
2 import inspect |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
3 import os |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
4 import subprocess |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
5 import sys |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
6 import tempfile |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
7 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
8 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
# Locate the directory that contains this test file, then its parent
# directory (one level above the test directory).
_this_file = inspect.getfile(inspect.currentframe())
currentdir = os.path.dirname(os.path.abspath(_this_file))
parent_dir = os.path.dirname(currentdir)

# Put <parent_dir>/dedup_hash/ at the front of sys.path so that the import
# below resolves against that directory before anything else on the path.
sys.path.insert(0, os.path.join(parent_dir, 'dedup_hash/'))
import dedup_hash  # noqa: E402 -- must follow the sys.path manipulation above
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
13 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
14 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
# Directory holding the FASTQ fixtures; file names below are joined onto it.
TEST_DATA_DIR = os.path.join(parent_dir, 'test-data/')

# Input file names (relative to TEST_DATA_DIR) for the two-file test cases,
# in plain and gzip-suffixed form.
UNCOMPRESSED_IN = ['r1.fastq', 'r2.fastq']
COMPRESSED_IN = ['r1.fastq.gz', 'r2.fastq.gz']

# Output file names for the two-file case.
# NOTE(review): not referenced by the code visible in this file.
UNCOMPRESSED_OUT = ['r1_dedup.fastq', 'r2_dedup.fastq']

# Input file name for the one-file test case.
SINGLE_IN = ['r1.fastq']

# Output file name for the one-file case.
# NOTE(review): not referenced by the code visible in this file.
SINGLE_OUT = ['r1_dedup.fastq']
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
21 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
22 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
23 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
def run(input):
    """Run one end-to-end test case for the given fixture file names.

    Builds the keyword arguments with ``prepare_args``, runs the
    deduplication with ``run_dedup`` and then checks the result with
    ``compare_output``.

    :param input: list of input file names, relative to ``TEST_DATA_DIR``.
    """
    # The parameter name shadows the ``input`` builtin; it is kept as-is so
    # that existing keyword callers continue to work.
    kwargs = prepare_args(input)
    run_dedup(kwargs)
    compare_output(kwargs)
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
28 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
29 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
def compare_output(args):
    """Check the first produced output file against the reference output.

    The MD5 digest of ``args['outfiles'][0]`` is compared with the digest of
    ``r1_dedup.fastq`` in ``TEST_DATA_DIR``. When the digests differ, ``diff``
    is invoked on the two files so the differences are shown before the
    failure propagates.

    :param args: dict as returned by ``prepare_args``; only ``'outfiles'`` is
        read here.
    :raises subprocess.CalledProcessError: if ``diff`` exits with a non-zero
        status (which it does when the files differ).
    """
    produced = args['outfiles'][0]
    ref_out1 = os.path.join(TEST_DATA_DIR, 'r1_dedup.fastq')
    # Use an explicit comparison instead of ``assert``: assert statements are
    # removed under ``python -O``, which would make this check a silent no-op.
    if md5(produced) != md5(ref_out1):
        # Pass the arguments as a list rather than splitting a formatted
        # string, so paths that contain spaces are handled correctly.
        subprocess.check_call(['diff', '-Nru', produced, ref_out1])
    # NOTE(review): only the first output file is verified; any second output
    # file is not compared against a reference here -- confirm this is
    # intended.
    print('all good')
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
38 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
39 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
def prepare_args(test_files):
    """Build the keyword arguments for one deduplication run.

    :param test_files: list of input file names, relative to
        ``TEST_DATA_DIR``.
    :returns: dict with the keys ``'infiles'`` (input paths inside
        ``TEST_DATA_DIR``), ``'outfiles'`` (one freshly created temporary
        file path per input file) and ``'write_gzip'`` (always ``False``).

    The temporary output files are created on disk and are not removed by
    this function.
    """
    infiles = [os.path.join(TEST_DATA_DIR, test_file) for test_file in test_files]
    outfiles = []
    # Same number of output files as input files.
    for _ in test_files:
        # mkstemp returns an open OS-level descriptor; close it right away so
        # no file handle is left dangling (the previous
        # NamedTemporaryFile(delete=False).name idiom kept the file object
        # open until it happened to be garbage collected).
        fd, path = tempfile.mkstemp()
        os.close(fd)
        outfiles.append(path)
    kwargs = {'infiles': infiles,
              'outfiles': outfiles,
              'write_gzip': False}
    return kwargs
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
47 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
48 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
def run_dedup(kwargs):
    """Invoke the deduplication entry point with the prepared arguments.

    Obtains a callable from ``dedup_hash.get_unique_fastq_instance()`` and
    calls it with ``kwargs`` expanded as keyword arguments. The result of
    that call is discarded.

    :param kwargs: dict of keyword arguments, as built by ``prepare_args``.
    """
    dedup_hash.get_unique_fastq_instance()(**kwargs)
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
52 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
def md5(fname, chunk_size=4096):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is opened in binary mode and read in pieces of ``chunk_size``
    bytes, so it is never loaded into memory as a whole.

    :param fname: path of the file to hash.
    :param chunk_size: number of bytes to read per iteration (default 4096).
    :returns: the digest as a hexadecimal string.
    :raises ValueError: if ``chunk_size`` is smaller than 1.
    """
    # read(0) returns b"" immediately (yielding the digest of an empty file)
    # and a negative size reads everything at once; reject both explicitly.
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
59 |
f33e9e6a6c88
planemo upload for repository https://github.com/mvdbeek/dedup_hash commit 367da560c5924d56c39f91ef9c731e523825424b-dirty
mvdbeek
parents:
diff
changeset
|
if __name__ == '__main__':
    # Run the test once per fixture set, in this order: plain two-file input,
    # gzip-suffixed two-file input, single-file input.
    for test_input in (UNCOMPRESSED_IN, COMPRESSED_IN, SINGLE_IN):
        run(test_input)