Mercurial > repos > gdroc > scaffremodler
view scaffremodler/group4contig.py @ 0:66885fa414c8 draft default tip
Uploaded
| author | gdroc |
|---|---|
| date | Mon, 14 Nov 2016 08:31:23 -0500 |
| parents | |
| children |
line wrap: on
line source
# # Copyright 2014 CIRAD # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, see <http://www.gnu.org/licenses/> or # write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, # MA 02110-1301, USA. # # import optparse, os, shutil, subprocess, sys, tempfile, fileinput, ConfigParser, operator, time def __main__(): #Parse Command Line parser = optparse.OptionParser(usage="python %prog [options]\n\nProgram designed by Guillaume MARTIN : guillaume.martin@cirad.fr\n\n" "This script takes scaffold to join and group them by linkage. The input is a tabulated file looking as folowed:\n" "scaffold1 1 2458028 FWD scaffold2 1 1 FWD FWD contig\n" "scaffold36 1 250001 FWD scaffold5 2000000 2000000 FWD REV contig") # Wrapper options. parser.add_option( '', '--table', dest='table', help='The table file input') parser.add_option( '', '--out', dest='out', default='intermediate_junction.txt', help='The output file name, [default: %default]') (options, args) = parser.parse_args() # Filling dictionnary dico_line = set() file = open(options.table) for line in file: data = line.split() if data: if data[3] != 'FWD' or data[7] != 'FWD': sys.exit('Warning! the orientation filled in column 4 or 8 is not managed, this column should contain "FWD" ') dico_line.add(line) file.close() dico = set() # --> to identify already treated lines dico_scaff = {} # --> a hash table containing a list of three elements : (1) grouped scaffold ordered, (2) scaffold orientation in the group, (3) ungrouped scaffolds, (4) scaffold number i = 0 for line in dico_line: data = line.split() if data: if not(line in dico): # --> this line start a new group i += 1 # print i dico.add(line) # --> record a line already treated dico_scaff[i] = [[data[4]],[data[7]],set(), 1] taille_dico = int(dico_scaff[i][3]) # --> to define a group is inflating or not (in the while loop) if data[5] == '1':# --> scaffold should be inserted at the begining of the list dico_scaff[i][0].insert(0,data[0]) dico_scaff[i][1].insert(0,data[8]) dico_scaff[i][3] += 1 else: dico_scaff[i][0].append(data[0]) dico_scaff[i][1].append(data[8]) dico_scaff[i][3] += 1 # --> The new group as been stared while taille_dico < dico_scaff[i][3]: # --> verification is new scaffolds have been added to group taille_dico = int(dico_scaff[i][3]) # print dico_scaff[i][0], dico_scaff[i][1], len(dico_scaff[i][2]), dico_scaff[i][3] for line2 in dico_line: data2 = line2.split() if data2 and not(line2 in dico): if data2[4] in dico_scaff[i][0]: # --> checking if this scaffold is already in the group dico.add(line2) dico_scaff[i][3] += 1 if data2[4] == dico_scaff[i][0][-1]: # --> this scaffold is at the end of the list if dico_scaff[i][1][-1] == 'FWD': # --> if this scaffold is FWD orientated if data2[5] == '1': # --> the new scaffold could not be inserted dico_scaff[i][2].add(line2) # print 'toto1' else: # --> the new scaffold could be inserted at the end dico_scaff[i][0].append(data2[0]) dico_scaff[i][1].append(data2[8]) elif dico_scaff[i][1][-1] == 'REV': # --> if this scaffold is REV orientated if data2[5] != '1': # --> the new scaffold could not be inserted dico_scaff[i][2].add(line2) # print 'toto2' else: # --> the new scaffold could be inserted at the end but its orientation should be reverted dico_scaff[i][0].append(data2[0]) if data2[8] == 'FWD': dico_scaff[i][1].append('REV') elif data2[8] == 'REV': dico_scaff[i][1].append('FWD') else: sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') else: sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') elif data2[4] == dico_scaff[i][0][0]: # --> this scaffold is at the begining of the list if dico_scaff[i][1][0] == 'FWD': # --> if this scaffold is FWD orientated if data2[5] == '1': # --> the new scaffold could be inserted at the begining dico_scaff[i][0].insert(0,data2[0]) dico_scaff[i][1].insert(0,data2[8]) else:# --> the new scaffold could not be inserted dico_scaff[i][2].add(line2) # print 'toto3' elif dico_scaff[i][1][0] == 'REV': # --> if this scaffold is REV orientated if data2[5] == '1': # --> the new scaffold could not be inserted dico_scaff[i][2].add(line2) # print 'toto4' else: # --> the new scaffold could be inserted at the begining but its orientation should be reverted dico_scaff[i][0].insert(0,data2[0]) if data2[8] == 'FWD': dico_scaff[i][1].insert(0,'REV') elif data2[8] == 'REV': dico_scaff[i][1].insert(0,'FWD') else: sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') else: sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') else: dico_scaff[i][2].add(line2) # print 'toto5' elif data2[0] in dico_scaff[i][0]: # --> checking if this scaffold is already in the group dico.add(line2) dico_scaff[i][3] += 1 if data2[0] == dico_scaff[i][0][-1]: # --> this scaffold is at the end of the list if dico_scaff[i][1][-1] == 'FWD': # --> if this scaffold is FWD orientated if data2[5] == '1': if data2[8] == 'FWD': # --> the new scaffold could be inserted at the end dico_scaff[i][0].append(data2[4]) dico_scaff[i][1].append(data2[7]) elif data2[8] == 'REV': # --> the new scaffold could not be inserted dico_scaff[i][2].add(line2) # print 'toto6' else: sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') else: if data2[8] == 'REV': # --> the new scaffold could be inserted at the end dico_scaff[i][0].append(data2[4]) dico_scaff[i][1].append('REV') elif data2[8] == 'FWD': # --> the new scaffold could not be inserted dico_scaff[i][2].add(line2) # print 'toto7' else: sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') elif dico_scaff[i][1][-1] == 'REV': # --> if this scaffold is REV orientated if data2[5] == '1': if data2[8] == 'REV': # --> the new scaffold could be inserted at the end dico_scaff[i][0].append(data2[4]) dico_scaff[i][1].append(data2[7]) elif data2[8] == 'FWD': # --> the new scaffold could not be inserted dico_scaff[i][2].add(line2) # print 'toto8' else: sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') else: if data2[8] == 'FWD': # --> the new scaffold could be inserted at the end dico_scaff[i][0].append(data2[4]) dico_scaff[i][1].append('REV') elif data2[8] == 'REV': # --> the new scaffold could not be inserted dico_scaff[i][2].add(line2) # print 'toto9' else: sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') else: sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') elif data2[0] == dico_scaff[i][0][0]: # --> this scaffold is at the begining of the list if dico_scaff[i][1][0] == 'FWD': # --> if this scaffold is FWD orientated if data2[5] != '1': if data2[8] == 'FWD': # --> the new scaffold could be inserted at the begining dico_scaff[i][0].insert(0,data2[4]) dico_scaff[i][1].insert(0,data2[7]) elif data2[8] == 'REV': # --> the new scaffold could not be inserted dico_scaff[i][2].add(line2) # print 'toto10' else: sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') else: if data2[8] == 'REV': # --> the new scaffold could be inserted at begining dico_scaff[i][0].insert(0,data2[4]) dico_scaff[i][1].insert(0,'REV') elif data2[8] == 'FWD': # --> the new scaffold could not be inserted dico_scaff[i][2].add(line2) # print 'toto11' else: sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') elif dico_scaff[i][1][0] == 'REV': # --> if this scaffold is REV orientated if data2[5] == '1': if data2[8] == 'FWD': # --> the new scaffold could be inserted at the begining dico_scaff[i][0].insert(0,data2[4]) dico_scaff[i][1].insert(0,'REV') elif data2[8] == 'REV': # --> the new scaffold could not be inserted dico_scaff[i][2].add(line2) # print 'toto12' else: sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') else: if data2[8] == 'REV': # --> the new scaffold could be inserted at begining dico_scaff[i][0].insert(0,data2[4]) dico_scaff[i][1].insert(0,data2[7]) elif data2[8] == 'FWD': # --> the new scaffold could not be inserted dico_scaff[i][2].add(line2) # print 'toto13' else: sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') else: sys.exit('Unrecognized orientaion in column 8. Only FWD and REV are recognized') else: dico_scaff[i][2].add(line2) # print 'toto14' outfile = open(options.out,'w') for n in dico_scaff: liste = list(dico_scaff[n][0]) liste.sort() outfile.write('>'+liste[0]+'\n') # print '>'+liste[0] for k in dico_scaff[n][0]: outfile.write('\t'.join([k,dico_scaff[n][1][dico_scaff[n][0].index(k)]])+'\n') # print '\t'.join([k,dico_scaff[n][1][dico_scaff[n][0].index(k)]]) for k in dico_scaff[n][2]: data = k.split() if data[5] == '1': if data[3] == data[8]: outfile.write('not_grouped\t'+data[0]+'\tFWD\t'+data[4]+'\t'+data[7]+'\n') else: outfile.write('not_grouped\t'+data[0]+'\tREV\t'+data[4]+'\t'+data[7]+'\n') else: if data[3] == data[8]: outfile.write('not_grouped\t'+data[4]+'\t'+data[7]+'\t'+data[0]+'\tFWD'+'\n') else: outfile.write('not_grouped\t'+data[4]+'\t'+data[7]+'\t'+data[0]+'\tREV'+'\n') outfile.close() if __name__ == "__main__": __main__()
