Mercurial > repos > thondeboer > neat_genreads
comparison utilities/validateFQ.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer |
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:6e75a84e9338 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 # | |
| 4 # A quickie tool for validating the correctness of a FASTQ file | |
| 5 # | |
| 6 # python validateFQ.py read1.fq [read2.fq] | |
| 7 # | |
| 8 | |
| 9 import sys | |
| 10 | |
def get4lines(fn):
    """Read the next four lines (one FASTQ record) from an open file.

    Every line is stripped of surrounding whitespace.  ``readline()``
    returns an empty string at end of file, so four empty strings mean
    that no further record is available.  Note that a blank line strips
    to an empty string as well, so it cannot be told apart from EOF here.

    Args:
        fn: an open file-like object providing ``readline()``.

    Returns:
        Tuple ``(l1, l2, l3, l4)`` holding the four stripped lines.

    Exits:
        Prints the partial record and exits with status 1 when some, but
        not all, of the four lines are empty (a truncated record).
    """
    lines = tuple(fn.readline().strip() for _ in range(4))
    if any(lines) and not all(lines):
        # Single-argument print(...) emits identical text on Python 2 and 3.
        print('\nError: missing lines:\n')
        print('\n'.join(lines) + '\n')
        # sys.exit instead of the site-provided exit(), which is meant for
        # interactive sessions and is not guaranteed to be defined.
        sys.exit(1)
    return lines
| 21 | |
# Characters accepted in the quality line: ASCII '!' through 'J'.
ALLOWED_QUAL = '!\"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJ'
# Characters accepted in the sequence line (upper case only).
ALLOWED_NUCL = 'ACGTN'

def validate4lines(l1,l2,l3,l4):
    """Check one four-line FASTQ record and exit on the first bad record.

    Checks performed, in order:
      1. ``l1`` starts with '@' and has '/' as its second-to-last
         character, and ``l3`` starts with '+'.
      2. ``l2`` (sequence) and ``l4`` (quality) have the same length.
      3. ``l2`` contains only characters from ``ALLOWED_NUCL``.
      4. ``l4`` contains only characters from ``ALLOWED_QUAL``.

    When several checks fail, the highest-numbered failing check is the
    one reported.

    Args:
        l1: header line.
        l2: sequence line.
        l3: separator line.
        l4: quality line.

    Returns:
        None when the record passes every check.

    Exits:
        Prints a diagnostic plus the offending record and exits with
        status 1 when any check fails.
    """
    failed = 0
    # make sure lines contain correct delimiters; startswith() and the
    # slice tolerate empty or one-character lines instead of raising
    # IndexError, so such records are reported as malformed.
    if not l1.startswith('@') or l1[-2:-1] != '/' or not l3.startswith('+'):
        failed = 1
    # make sure seq len == qual length
    if len(l2) != len(l4):
        failed = 2
    # make sure seq string contains only valid characters
    if not set(l2).issubset(ALLOWED_NUCL):
        failed = 3
    # make sure qual string contains only valid characters
    if not set(l4).issubset(ALLOWED_QUAL):
        failed = 4
    if failed:
        reasons = {
            1: ' ---- invalid delimiters\n',
            2: ' ---- seq len != qual len\n',
            3: ' ---- seq contains invalid characters\n',
            4: ' ---- qual contains invalid characters\n',
        }
        # Single-argument print(...) emits identical text on Python 2 and 3.
        print('\nError: malformed lines:')
        print(reasons[failed])
        print(l1+'\n'+l2+'\n'+l3+'\n'+l4+'\n')
        # sys.exit rather than the site-provided interactive exit().
        sys.exit(1)
| 49 | |
# Driver: validate read1.fq record by record and, when a second file is
# given, validate read2.fq in lockstep and compare the read names.
if len(sys.argv) < 2:
    # Without this guard a missing argument surfaces as a bare IndexError.
    print('\nUsage: python validateFQ.py read1.fq [read2.fq]\n')
    sys.exit(1)

f1 = open(sys.argv[1],'r')
f2 = None
try:
    (l1_r1, l2_r1, l3_r1, l4_r1) = get4lines(f1)
    if len(sys.argv) == 3:
        f2 = open(sys.argv[2],'r')
        (l1_r2, l2_r2, l3_r2, l4_r2) = get4lines(f2)

    while l1_r1:
        # check line syntax
        validate4lines(l1_r1,l2_r1,l3_r1,l4_r1)
        if f2 is not None:
            # r2 ran out of records while r1 still has some
            if not l1_r2:
                print('\nError: r2 has fewer reads than r1:\n')
                print(l1_r1+'\n')
                sys.exit(1)
            validate4lines(l1_r2,l2_r2,l3_r2,l4_r2)
            # make sure seq id is same for r1/r2 (the final character,
            # which differs between mates, is excluded from the comparison)
            if l1_r1[:-1] != l1_r2[:-1]:
                print('\nError: mismatched r1/r2 name:\n')
                print(l1_r1+'\n'+l1_r2+'\n')
                sys.exit(1)

        # grab next 4 lines...
        (l1_r1, l2_r1, l3_r1, l4_r1) = get4lines(f1)
        if f2 is not None:
            (l1_r2, l2_r2, l3_r2, l4_r2) = get4lines(f2)

    # r1 is exhausted; any record left in r2 is unpaired
    if f2 is not None and l1_r2:
        print('\nError: r2 has more reads than r1:\n')
        print(l1_r2+'\n')
        sys.exit(1)
finally:
    # runs on the sys.exit() paths too, so the handles are always closed
    if f2 is not None:
        f2.close()
    f1.close()

print('\nPASSED WITH FLYING COLORS. GOOD DAY.\n')
| 78 |
