Mercurial > repos > rnateam > graphclust_preprocessing
comparison splitStockholm.py @ 11:c0c9d19bc7b2 draft
planemo upload for repository https://github.com/eteriSokhoyan/galaxytools/tree/branchForIterations/tools/GraphClust commit 746497a64b955f6b9afc1944d1c1d8d877e53267
| author | rnateam |
|---|---|
| date | Tue, 18 Jul 2017 01:43:49 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 10:16bcaef3dc1e | 11:c0c9d19bc7b2 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 ######## | |
| 4 # This script reads multiple alignments merged in single stockholm file | |
| 5 # and splits the alignment blocks according to data.names table | |
| 6 # The first sequence of each alignment file assumed to match to names table entries | |
| 7 # Author: M. Miladi | |
| 8 ######## | |
| 9 import os | |
| 10 import re | |
| 11 import sys | |
| 12 | |
| 13 from Bio import AlignIO, SeqIO | |
| 14 try: | |
| 15 from StringIO import StringIO | |
| 16 except ImportError: | |
| 17 from io import StringIO | |
| 18 | |
| 19 stk_file = sys.argv[1] | |
| 20 print ("Parsing and splitting stk file:{}".format(stk_file)) | |
| 21 target_f = "alignment_data_split.stk" | |
| 22 pattern = re.compile("^>.*$") | |
| 23 toWriteID = "" | |
| 24 | |
| 25 count_for_id = 1 | |
| 26 seq_counter = 0 | |
| 27 new_id = "" | |
| 28 | |
| 29 seq_id = [] | |
| 30 seq_string = [] | |
| 31 orig_id = [] | |
| 32 name_file = "FASTA/data.names" | |
| 33 array_all_chunks = [] | |
| 34 with open(name_file, 'r') as f: | |
| 35 for line in f: | |
| 36 if len(line.strip()) == 0: | |
| 37 continue | |
| 38 seq_id.append(int(line.split()[0])) | |
| 39 seq_string.append(line.split()[1]) | |
| 40 orig_id_srt = line.split()[3] | |
| 41 orig_id_srt = orig_id_srt.rsplit('_',1)[0] | |
| 42 orig_id.append(orig_id_srt) | |
| 43 | |
| 44 | |
| 45 | |
| 46 with open(stk_file) as stk_in: | |
| 47 alignments = AlignIO.parse(stk_in, "stockholm")#, alphabet=IUPAC.ambiguous_rna) | |
| 48 alignments_dic = {(a[0].id):a for a in alignments} | |
| 49 | |
| 50 | |
| 51 regx_gaps = '[-.~_]' # valid gap symbols all be converted to "-" | |
| 52 str_gaps = '-.~_' # valid gap symbols all be converted to "-" | |
| 53 | |
| 54 | |
| 55 chunks = [] | |
| 56 with open(target_f, 'w') as out_stk_handle: | |
| 57 for i in range(len(orig_id)): | |
| 58 | |
| 59 #---------------------- | |
| 60 # We need to map ungapped positions of the chunks to gapped positions of first sequence | |
| 61 gap_count = 0 | |
| 62 ungap_ind = 0 | |
| 63 dic_gap_counts = dict() | |
| 64 cur_alignment = alignments_dic[orig_id[i]] | |
| 65 for c in cur_alignment[0].seq: | |
| 66 #print ungap_ind | |
| 67 if c in str_gaps: | |
| 68 gap_count += 1 | |
| 69 else: | |
| 70 dic_gap_counts[ungap_ind] = gap_count | |
| 71 ungap_ind += 1 | |
| 72 ID = str(seq_id[i]) + " " + seq_string[i] | |
| 73 chunks = re.findall(r'\d+', seq_string[i]) | |
| 74 print (ID,chunks) | |
| 75 | |
| 76 index_start, index_end =int(chunks[1])-1, int(chunks[2])-1 | |
| 77 subalign = cur_alignment[:, index_start + dic_gap_counts[index_start]: | |
| 78 index_end+dic_gap_counts[index_end]+1] | |
| 79 | |
| 80 #---------------------- | |
| 81 # BioPython does not handel the GF ID entry for alignment | |
| 82 # So we add entry in the second line manually | |
| 83 siotmp = StringIO() | |
| 84 AlignIO.write(subalign, siotmp, format="stockholm") | |
| 85 stk_lines = siotmp.getvalue().split('\n') | |
| 86 out_stk_handle.write('{}\n'.format(stk_lines[0])) | |
| 87 out_stk_handle.write('#=GF ID {}\n'.format(ID)) | |
| 88 out_stk_handle.writelines('\n'.join(stk_lines[1:])) | |
| 89 #print out_stk_handle.getvalue() | |
| 90 | |
| 91 |
