Mercurial > repos > gdroc > scaffremodler
comparison scaffremodler/group4contig.py @ 0:66885fa414c8 draft default tip
Uploaded
| author | gdroc |
|---|---|
| date | Mon, 14 Nov 2016 08:31:23 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:66885fa414c8 |
|---|---|
| 1 | |
| 2 # | |
| 3 # Copyright 2014 CIRAD | |
| 4 # | |
| 5 # This program is free software; you can redistribute it and/or modify | |
| 6 # it under the terms of the GNU General Public License as published by | |
| 7 # the Free Software Foundation; either version 3 of the License, or | |
| 8 # (at your option) any later version. | |
| 9 # | |
| 10 # This program is distributed in the hope that it will be useful, | |
| 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 13 # GNU General Public License for more details. | |
| 14 # | |
| 15 # You should have received a copy of the GNU General Public License | |
| 16 # along with this program; if not, see <http://www.gnu.org/licenses/> or | |
| 17 # write to the Free Software Foundation, Inc., | |
| 18 # 51 Franklin Street, Fifth Floor, Boston, | |
| 19 # MA 02110-1301, USA. | |
| 20 # | |
| 21 # | |
| 22 | |
| 23 import optparse, os, shutil, subprocess, sys, tempfile, fileinput, ConfigParser, operator, time | |
| 24 | |
| 25 | |
| 26 | |
| 27 def __main__(): | |
| 28 #Parse Command Line | |
| 29 parser = optparse.OptionParser(usage="python %prog [options]\n\nProgram designed by Guillaume MARTIN : guillaume.martin@cirad.fr\n\n" | |
| 30 "This script takes scaffold to join and group them by linkage. The input is a tabulated file looking as folowed:\n" | |
| 31 "scaffold1 1 2458028 FWD scaffold2 1 1 FWD FWD contig\n" | |
| 32 "scaffold36 1 250001 FWD scaffold5 2000000 2000000 FWD REV contig") | |
| 33 # Wrapper options. | |
| 34 parser.add_option( '', '--table', dest='table', help='The table file input') | |
| 35 parser.add_option( '', '--out', dest='out', default='intermediate_junction.txt', help='The output file name, [default: %default]') | |
| 36 (options, args) = parser.parse_args() | |
| 37 | |
| 38 # Filling dictionnary | |
| 39 dico_line = set() | |
| 40 file = open(options.table) | |
| 41 for line in file: | |
| 42 data = line.split() | |
| 43 if data: | |
| 44 if data[3] != 'FWD' or data[7] != 'FWD': | |
| 45 sys.exit('Warning! the orientation filled in column 4 or 8 is not managed, this column should contain "FWD" ') | |
| 46 dico_line.add(line) | |
| 47 file.close() | |
| 48 | |
| 49 dico = set() # --> to identify already treated lines | |
| 50 dico_scaff = {} # --> a hash table containing a list of three elements : (1) grouped scaffold ordered, (2) scaffold orientation in the group, (3) ungrouped scaffolds, (4) scaffold number | |
| 51 i = 0 | |
| 52 for line in dico_line: | |
| 53 data = line.split() | |
| 54 if data: | |
| 55 if not(line in dico): # --> this line start a new group | |
| 56 i += 1 | |
| 57 # print i | |
| 58 dico.add(line) # --> record a line already treated | |
| 59 dico_scaff[i] = [[data[4]],[data[7]],set(), 1] | |
| 60 taille_dico = int(dico_scaff[i][3]) # --> to define a group is inflating or not (in the while loop) | |
| 61 if data[5] == '1':# --> scaffold should be inserted at the begining of the list | |
| 62 dico_scaff[i][0].insert(0,data[0]) | |
| 63 dico_scaff[i][1].insert(0,data[8]) | |
| 64 dico_scaff[i][3] += 1 | |
| 65 else: | |
| 66 dico_scaff[i][0].append(data[0]) | |
| 67 dico_scaff[i][1].append(data[8]) | |
| 68 dico_scaff[i][3] += 1 | |
| 69 # --> The new group as been stared | |
| 70 while taille_dico < dico_scaff[i][3]: # --> verification is new scaffolds have been added to group | |
| 71 taille_dico = int(dico_scaff[i][3]) | |
| 72 # print dico_scaff[i][0], dico_scaff[i][1], len(dico_scaff[i][2]), dico_scaff[i][3] | |
| 73 for line2 in dico_line: | |
| 74 data2 = line2.split() | |
| 75 if data2 and not(line2 in dico): | |
| 76 if data2[4] in dico_scaff[i][0]: # --> checking if this scaffold is already in the group | |
| 77 dico.add(line2) | |
| 78 dico_scaff[i][3] += 1 | |
| 79 if data2[4] == dico_scaff[i][0][-1]: # --> this scaffold is at the end of the list | |
| 80 if dico_scaff[i][1][-1] == 'FWD': # --> if this scaffold is FWD orientated | |
| 81 if data2[5] == '1': # --> the new scaffold could not be inserted | |
| 82 dico_scaff[i][2].add(line2) | |
| 83 # print 'toto1' | |
| 84 else: # --> the new scaffold could be inserted at the end | |
| 85 dico_scaff[i][0].append(data2[0]) | |
| 86 dico_scaff[i][1].append(data2[8]) | |
| 87 elif dico_scaff[i][1][-1] == 'REV': # --> if this scaffold is REV orientated | |
| 88 if data2[5] != '1': # --> the new scaffold could not be inserted | |
| 89 dico_scaff[i][2].add(line2) | |
| 90 # print 'toto2' | |
| 91 else: # --> the new scaffold could be inserted at the end but its orientation should be reverted | |
| 92 dico_scaff[i][0].append(data2[0]) | |
| 93 if data2[8] == 'FWD': | |
| 94 dico_scaff[i][1].append('REV') | |
| 95 elif data2[8] == 'REV': | |
| 96 dico_scaff[i][1].append('FWD') | |
| 97 else: | |
| 98 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') | |
| 99 else: | |
| 100 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') | |
| 101 elif data2[4] == dico_scaff[i][0][0]: # --> this scaffold is at the begining of the list | |
| 102 if dico_scaff[i][1][0] == 'FWD': # --> if this scaffold is FWD orientated | |
| 103 if data2[5] == '1': # --> the new scaffold could be inserted at the begining | |
| 104 dico_scaff[i][0].insert(0,data2[0]) | |
| 105 dico_scaff[i][1].insert(0,data2[8]) | |
| 106 else:# --> the new scaffold could not be inserted | |
| 107 dico_scaff[i][2].add(line2) | |
| 108 # print 'toto3' | |
| 109 elif dico_scaff[i][1][0] == 'REV': # --> if this scaffold is REV orientated | |
| 110 if data2[5] == '1': # --> the new scaffold could not be inserted | |
| 111 dico_scaff[i][2].add(line2) | |
| 112 # print 'toto4' | |
| 113 else: # --> the new scaffold could be inserted at the begining but its orientation should be reverted | |
| 114 dico_scaff[i][0].insert(0,data2[0]) | |
| 115 if data2[8] == 'FWD': | |
| 116 dico_scaff[i][1].insert(0,'REV') | |
| 117 elif data2[8] == 'REV': | |
| 118 dico_scaff[i][1].insert(0,'FWD') | |
| 119 else: | |
| 120 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') | |
| 121 else: | |
| 122 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') | |
| 123 else: | |
| 124 dico_scaff[i][2].add(line2) | |
| 125 # print 'toto5' | |
| 126 elif data2[0] in dico_scaff[i][0]: # --> checking if this scaffold is already in the group | |
| 127 dico.add(line2) | |
| 128 dico_scaff[i][3] += 1 | |
| 129 if data2[0] == dico_scaff[i][0][-1]: # --> this scaffold is at the end of the list | |
| 130 if dico_scaff[i][1][-1] == 'FWD': # --> if this scaffold is FWD orientated | |
| 131 if data2[5] == '1': | |
| 132 if data2[8] == 'FWD': # --> the new scaffold could be inserted at the end | |
| 133 dico_scaff[i][0].append(data2[4]) | |
| 134 dico_scaff[i][1].append(data2[7]) | |
| 135 elif data2[8] == 'REV': # --> the new scaffold could not be inserted | |
| 136 dico_scaff[i][2].add(line2) | |
| 137 # print 'toto6' | |
| 138 else: | |
| 139 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') | |
| 140 else: | |
| 141 if data2[8] == 'REV': # --> the new scaffold could be inserted at the end | |
| 142 dico_scaff[i][0].append(data2[4]) | |
| 143 dico_scaff[i][1].append('REV') | |
| 144 elif data2[8] == 'FWD': # --> the new scaffold could not be inserted | |
| 145 dico_scaff[i][2].add(line2) | |
| 146 # print 'toto7' | |
| 147 else: | |
| 148 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') | |
| 149 elif dico_scaff[i][1][-1] == 'REV': # --> if this scaffold is REV orientated | |
| 150 if data2[5] == '1': | |
| 151 if data2[8] == 'REV': # --> the new scaffold could be inserted at the end | |
| 152 dico_scaff[i][0].append(data2[4]) | |
| 153 dico_scaff[i][1].append(data2[7]) | |
| 154 elif data2[8] == 'FWD': # --> the new scaffold could not be inserted | |
| 155 dico_scaff[i][2].add(line2) | |
| 156 # print 'toto8' | |
| 157 else: | |
| 158 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') | |
| 159 else: | |
| 160 if data2[8] == 'FWD': # --> the new scaffold could be inserted at the end | |
| 161 dico_scaff[i][0].append(data2[4]) | |
| 162 dico_scaff[i][1].append('REV') | |
| 163 elif data2[8] == 'REV': # --> the new scaffold could not be inserted | |
| 164 dico_scaff[i][2].add(line2) | |
| 165 # print 'toto9' | |
| 166 else: | |
| 167 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') | |
| 168 else: | |
| 169 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') | |
| 170 elif data2[0] == dico_scaff[i][0][0]: # --> this scaffold is at the begining of the list | |
| 171 if dico_scaff[i][1][0] == 'FWD': # --> if this scaffold is FWD orientated | |
| 172 if data2[5] != '1': | |
| 173 if data2[8] == 'FWD': # --> the new scaffold could be inserted at the begining | |
| 174 dico_scaff[i][0].insert(0,data2[4]) | |
| 175 dico_scaff[i][1].insert(0,data2[7]) | |
| 176 elif data2[8] == 'REV': # --> the new scaffold could not be inserted | |
| 177 dico_scaff[i][2].add(line2) | |
| 178 # print 'toto10' | |
| 179 else: | |
| 180 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') | |
| 181 else: | |
| 182 if data2[8] == 'REV': # --> the new scaffold could be inserted at begining | |
| 183 dico_scaff[i][0].insert(0,data2[4]) | |
| 184 dico_scaff[i][1].insert(0,'REV') | |
| 185 elif data2[8] == 'FWD': # --> the new scaffold could not be inserted | |
| 186 dico_scaff[i][2].add(line2) | |
| 187 # print 'toto11' | |
| 188 else: | |
| 189 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') | |
| 190 elif dico_scaff[i][1][0] == 'REV': # --> if this scaffold is REV orientated | |
| 191 if data2[5] == '1': | |
| 192 if data2[8] == 'FWD': # --> the new scaffold could be inserted at the begining | |
| 193 dico_scaff[i][0].insert(0,data2[4]) | |
| 194 dico_scaff[i][1].insert(0,'REV') | |
| 195 elif data2[8] == 'REV': # --> the new scaffold could not be inserted | |
| 196 dico_scaff[i][2].add(line2) | |
| 197 # print 'toto12' | |
| 198 else: | |
| 199 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') | |
| 200 else: | |
| 201 if data2[8] == 'REV': # --> the new scaffold could be inserted at begining | |
| 202 dico_scaff[i][0].insert(0,data2[4]) | |
| 203 dico_scaff[i][1].insert(0,data2[7]) | |
| 204 elif data2[8] == 'FWD': # --> the new scaffold could not be inserted | |
| 205 dico_scaff[i][2].add(line2) | |
| 206 # print 'toto13' | |
| 207 else: | |
| 208 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') | |
| 209 else: | |
| 210 sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') | |
| 211 else: | |
| 212 dico_scaff[i][2].add(line2) | |
| 213 # print 'toto14' | |
| 214 | |
| 215 | |
| 216 outfile = open(options.out,'w') | |
| 217 for n in dico_scaff: | |
| 218 liste = list(dico_scaff[n][0]) | |
| 219 liste.sort() | |
| 220 outfile.write('>'+liste[0]+'\n') | |
| 221 # print '>'+liste[0] | |
| 222 for k in dico_scaff[n][0]: | |
| 223 outfile.write('\t'.join([k,dico_scaff[n][1][dico_scaff[n][0].index(k)]])+'\n') | |
| 224 # print '\t'.join([k,dico_scaff[n][1][dico_scaff[n][0].index(k)]]) | |
| 225 for k in dico_scaff[n][2]: | |
| 226 data = k.split() | |
| 227 if data[5] == '1': | |
| 228 if data[3] == data[8]: | |
| 229 outfile.write('not_grouped\t'+data[0]+'\tFWD\t'+data[4]+'\t'+data[7]+'\n') | |
| 230 else: | |
| 231 outfile.write('not_grouped\t'+data[0]+'\tREV\t'+data[4]+'\t'+data[7]+'\n') | |
| 232 else: | |
| 233 if data[3] == data[8]: | |
| 234 outfile.write('not_grouped\t'+data[4]+'\t'+data[7]+'\t'+data[0]+'\tFWD'+'\n') | |
| 235 else: | |
| 236 outfile.write('not_grouped\t'+data[4]+'\t'+data[7]+'\t'+data[0]+'\tREV'+'\n') | |
| 237 outfile.close() | |
| 238 | |
| 239 | |
| 240 | |
| 241 if __name__ == "__main__": __main__() |
