Mercurial > repos > abims-sbr > pairwise
comparison scripts/S01_run_first_blast.py @ 0:90b57ab0bd1d draft default tip
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
| author | abims-sbr |
|---|---|
| date | Fri, 01 Feb 2019 10:23:16 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:90b57ab0bd1d |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # coding: utf-8 | |
| 3 # Author : Victor Mataigne | |
| 4 | |
| 5 import itertools, argparse, os | |
| 6 | |
| 7 """ | |
| 8 IMPROVMENTS : | |
| 9 - Maybe a bit of code factoring | |
| 10 - See if it possible to avoid build several times the same db | |
| 11 """ | |
| 12 | |
| 13 # The script (and S03_run_second_blast.py as well) must be launched with the python '-W ignore' option if tested with planemo | |
| 14 | |
| 15 def main(): | |
| 16 parser = argparse.ArgumentParser() | |
| 17 parser.add_argument('files', help='fasta files separated by commas') | |
| 18 parser.add_argument('evalue', help='evalue for blast') | |
| 19 parser.add_argument('method', choices=['tblastx', 'diamond'], help='alignment tool (tblastx or diamond)') | |
| 20 args = parser.parse_args() | |
| 21 | |
| 22 in_files = args.files.split(',') | |
| 23 | |
| 24 if args.method == 'diamond': | |
| 25 in_files_translated = [] | |
| 26 from Bio.Seq import Seq | |
| 27 from Bio.Alphabet import IUPAC | |
| 28 | |
| 29 # From every sequence, make three sequences (translations in the three reading frames) | |
| 30 print 'Translating every sequence in all reading frames ...' | |
| 31 for file in in_files: | |
| 32 name = 'translated_%s' %file | |
| 33 in_files_translated.append(name) | |
| 34 translated_file = open(name, 'w') | |
| 35 with open(file, 'r') as file: | |
| 36 for name, seq in itertools.izip_longest(*[file]*2): | |
| 37 s = Seq(seq.strip('\n').upper(), IUPAC.ambiguous_dna) | |
| 38 translated_file.write(name.strip('\n')+'_orf_1\n') | |
| 39 translated_file.write(s.translate()._data+'\n') | |
| 40 translated_file.write(name.strip('\n')+'_orf_2\n') | |
| 41 translated_file.write(s[1:].translate()._data+'\n') | |
| 42 translated_file.write(name.strip('\n')+'_orf_3\n') | |
| 43 translated_file.write(s[2:].translate()._data+'\n') | |
| 44 translated_file.close() | |
| 45 | |
| 46 # Make the list of all pairwise combinations | |
| 47 list_pairwise = itertools.combinations(in_files_translated, 2) | |
| 48 | |
| 49 elif args.method == 'tblastx': | |
| 50 list_pairwise = itertools.combinations(in_files, 2) | |
| 51 | |
| 52 else : | |
| 53 print 'Mispecified alignment tool' | |
| 54 exit() | |
| 55 | |
| 56 os.mkdir('outputs_RBH_dna') | |
| 57 | |
| 58 # Main loop | |
| 59 | |
| 60 if args.method == 'diamond': | |
| 61 for pairwise in list_pairwise: | |
| 62 print "Pair of species:" | |
| 63 print pairwise | |
| 64 | |
| 65 sp1, sp2 = pairwise[0].split('_')[1], pairwise[1].split('_')[1] #rename 'translated_Xx_transcriptom.fasta' | |
| 66 sub_directory_name = sp1 + '_' + sp2 | |
| 67 os.mkdir('./blast_%s' %sub_directory_name) | |
| 68 | |
| 69 print 'Running first blast with Diamond ...' | |
| 70 | |
| 71 # Run diamond | |
| 72 os.system('diamond makedb --in %s -d %s >> log_diamond.log' %(pairwise[1], sp2)) | |
| 73 os.system('diamond blastp -q %s -d %s --max-target-seqs 1 -o matches_blast1_%s -e %s --more-sensitive >> log_diamond.log' %(pairwise[0], sp2, sub_directory_name, args.evalue)) | |
| 74 | |
| 75 # tabular output : | |
| 76 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore | |
| 77 | |
| 78 a = pairwise[1].replace('translated_', '') | |
| 79 b = pairwise[0].replace('translated_', '') | |
| 80 | |
| 81 # There is a chance to have no hits returned | |
| 82 if os.path.getsize('matches_blast1_%s' %sub_directory_name) == 0: | |
| 83 print 'No hits found. Processing next species pair ...' | |
| 84 else : | |
| 85 | |
| 86 # Record only one best_hit per transcript (best of the 6 orfs) | |
| 87 os.system('python S02_04_keep_one_hit_from_blast.py matches_blast1_%s %s %s %s %s %s' %(sub_directory_name, a, b, sub_directory_name, '1', args.method)) | |
| 88 | |
| 89 # 2d blast with only best hits as db | |
| 90 print 'Running second blast with Diamond ... ' | |
| 91 | |
| 92 os.system('python -W ignore S03_run_second_blast.py best_hits_db_blast1_%s %s %s %s %s' %(sub_directory_name, pairwise[0], sub_directory_name, args.evalue, args.method)) | |
| 93 | |
| 94 # Record only one best_hit per transcript (best of the 6 orfs) | |
| 95 os.system('python S02_04_keep_one_hit_from_blast.py matches_blast2_%s %s %s %s %s %s' %(sub_directory_name, b, a, sub_directory_name, '2', args.method)) | |
| 96 | |
| 97 # Find Reciprocical Best Hits | |
| 98 name1 = 'best_hits_q_blast1_{}'.format(sub_directory_name) | |
| 99 name2 = 'best_hits_q_blast2_{}'.format(sub_directory_name) | |
| 100 os.system('python S05_find_rbh.py %s %s ' %(name1, name2)) | |
| 101 | |
| 102 os.system('mv log_diamond.log ./blast_%s' %sub_directory_name) | |
| 103 os.system('rm -f *.dmnd') | |
| 104 | |
| 105 # Those files exist obly if hits were found during the first blast | |
| 106 if os.path.getsize('matches_blast1_%s' %sub_directory_name) != 0: | |
| 107 os.system('mv *best_hits* ./blast_%s' %sub_directory_name) | |
| 108 os.system('mv RBH* outputs_RBH_dna') | |
| 109 | |
| 110 os.system('mv matches_blast* ./blast_%s' %(sub_directory_name)) | |
| 111 | |
| 112 os.mkdir('translated_seqs') | |
| 113 os.system('mv translated*.fasta ./translated_seqs') | |
| 114 | |
| 115 elif args.method == 'tblastx': | |
| 116 for pairwise in list_pairwise: | |
| 117 print "Pair of species:" | |
| 118 print pairwise | |
| 119 | |
| 120 sp1, sp2 = pairwise[0].split('_')[0], pairwise[1].split('_')[0] | |
| 121 sub_directory_name = sp1 + '_' + sp2 | |
| 122 os.mkdir('./blast_%s' %sub_directory_name) | |
| 123 | |
| 124 print 'Running first tblastx ...' | |
| 125 | |
| 126 # Run diamond | |
| 127 os.system('formatdb -i %s -p F -o T >> log_tblastx.log' %(pairwise[1])) | |
| 128 os.system('blastall -p tblastx -d %s -i %s -o matches_blast1_%s -T F -e %s -F "mS" -b1 -v1 -K 1 -m 8 >> log_tblastx.log' %(pairwise[1], pairwise[0], sub_directory_name, args.evalue)) | |
| 129 | |
| 130 # tabular output : | |
| 131 # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore | |
| 132 | |
| 133 # There is a chance to have no hits returned | |
| 134 if os.path.getsize('matches_blast1_%s' %sub_directory_name) == 0: | |
| 135 print 'No hits found. Processing next species pair ...' | |
| 136 else: | |
| 137 | |
| 138 # Record only one best_hit per transcript (best of the 6 orfs) | |
| 139 os.system('python S02_04_keep_one_hit_from_blast.py matches_blast1_%s %s %s %s %s %s' %(sub_directory_name, pairwise[1], pairwise[0], sub_directory_name, '1', args.method)) | |
| 140 | |
| 141 # 2d blast with only best hits as db | |
| 142 print 'Running second blast with Diamond ... ' | |
| 143 | |
| 144 os.system('python S03_run_second_blast.py best_hits_db_blast1_%s %s %s %s %s' %(sub_directory_name, pairwise[0], sub_directory_name, args.evalue, args.method)) | |
| 145 | |
| 146 # Record only one best_hit per transcript (best of the 6 orfs) | |
| 147 os.system('python S02_04_keep_one_hit_from_blast.py matches_blast2_%s %s %s %s %s %s' %(sub_directory_name, pairwise[0], pairwise[1], sub_directory_name, '2', args.method)) | |
| 148 | |
| 149 # Find Reciprocical Best Hits | |
| 150 name1 = 'best_hits_q_blast1_{}'.format(sub_directory_name) | |
| 151 name2 = 'best_hits_q_blast2_{}'.format(sub_directory_name) | |
| 152 os.system('python S05_find_rbh.py %s %s ' %(name1, name2)) | |
| 153 | |
| 154 os.system('mv log_tblastx.log ./blast_%s' %sub_directory_name) | |
| 155 os.system('rm -f *.nhr') | |
| 156 os.system('rm -f *.nin') | |
| 157 os.system('rm -f *.nsd') | |
| 158 os.system('rm -f *.nsi') | |
| 159 os.system('rm -f *.nsq') | |
| 160 | |
| 161 # Those files exist obly if hits were found during the first blast | |
| 162 if os.path.getsize('matches_blast1_%s' %sub_directory_name) != 0: | |
| 163 os.system('mv *best_hits* ./blast_%s' %sub_directory_name) | |
| 164 os.system('mv RBH* outputs_RBH_dna') | |
| 165 | |
| 166 os.system('mv matches_blast* ./blast_%s' %(sub_directory_name)) | |
| 167 #os.system('mv matches_blast2_%s ./blast_%s' %(sub_directory_name, sub_directory_name)) | |
| 168 | |
| 169 if __name__ == "__main__": | |
| 170 main() |
