comparison rename.py @ 4:e35a3509c160 draft

planemo upload commit 3675b4447d11fb1cd75d505886e1bf693f9d07f5
author yating-l
date Thu, 26 Jul 2018 15:46:24 -0400
parents 9529a207d704
children 7c8b327f298c
comparison
equal deleted inserted replaced
3:9529a207d704 4:e35a3509c160
1 """ 1 """
2 Call rename to rename scaffolds in reference genome so that the sequence names are less than 31 characters. Rename all scaffolds to scaffold_1, scaffold_2, ..., scaffold_N and provide a name mapping file 2 Call rename to rename scaffolds in reference genome so that the sequence names are less than 31 characters. Rename all scaffolds to scaffold_1, scaffold_2, ..., scaffold_N and provide a name mapping file
3 Call truncate to truncate the scaffold names that are more than 31 characters. Replace non-ASCII character with '_' 3 Call truncate to truncate the scaffold names that are more than 31 characters. Replace each invalid character (non-ASCII, '\t', '\n', '\x0b', '\x0c', '\r') with '_'
4 """ 4 """
5 import sys 5 import sys
6 import csv 6 import csv
7 import codecs 7 import codecs
8 import string 8 import string
19 line = ">" + newname + "\n" 19 line = ">" + newname + "\n"
20 i = i+1 20 i = i+1
21 writer.writerow([oldname.encode('utf-8'), newname]) 21 writer.writerow([oldname.encode('utf-8'), newname])
22 out.write(line) 22 out.write(line)
23 23
def truncate(inputFile, outputFile, valid_characters):
    """Copy a FASTA file, sanitizing and truncating every sequence name.

    Every header line (a line starting with '>') has its name passed through
    substituteNonAscii() so that characters outside ``valid_characters``
    become '_'. Names longer than 31 characters are then cut down to their
    first 31 characters. All other lines are copied to the output unchanged.
    Progress messages are printed to stdout.

    Args:
        inputFile: Path of the UTF-8 encoded FASTA file to read.
        outputFile: Path of the UTF-8 encoded FASTA file to write.
        valid_characters: Characters allowed to remain in a sequence name.

    Raises:
        SystemExit: If two headers end up with the same name after
            substitution and truncation.
    """
    # Python 2 needs byte strings when echoing text that may be non-ASCII.
    py2 = sys.version_info[0] < 3
    # A set keeps the duplicate check O(1) even for many thousands of scaffolds.
    seen_names = set()
    with codecs.open(outputFile, 'w', encoding='utf-8') as out:
        with codecs.open(inputFile, 'r', encoding='utf-8') as rf:
            # Stream the file line by line instead of loading the whole
            # genome into memory at once.
            for line in rf:
                # Only a '>' in the first column marks a FASTA header; a '>'
                # elsewhere in a line must not trigger the renaming logic,
                # which strips the first character of the line.
                if line.startswith(">"):
                    print(line.encode('utf-8') if py2 else line)
                    name = line[1:].rstrip()
                    name = substituteNonAscii(name, valid_characters)
                    if len(name) > 31:
                        name = name[:31]
                        print("\tTruncate the scaffold name to less than 31 characters: %s" % name)
                    # Truncation/substitution can make distinct names collide.
                    if name in seen_names:
                        sys.exit("Name conflict! Name " + name + " already exist.")
                    seen_names.add(name)
                    line = ">" + name + "\n"
                    print("======================\n")
                out.write(line)
43 43
def substituteNonAscii(str, valid_charaters):
    """Return a copy of ``str`` with every disallowed character replaced by '_'.

    Each character of ``str`` that is not contained in ``valid_charaters`` is
    replaced with an underscore, and a message naming the replaced character
    is printed to stdout.

    Args:
        str: The text to sanitize. The name shadows the builtin; it is kept
            unchanged so that existing callers are not broken.
        valid_charaters: Characters that may be kept as they are (the
            misspelling is preserved for the same reason).

    Returns:
        The sanitized text, one output character per input character.
    """
    # Build the lookup once instead of scanning the allowed string per character.
    allowed = frozenset(valid_charaters)
    # Python 2 needs a byte string to echo a possibly non-ASCII character.
    py2 = sys.version_info[0] < 3
    cleaned = []
    for c in str:
        if c not in allowed:
            shown = c.encode('utf-8') if py2 else c
            print("\tSubstitute invalid character %s with _" % shown)
            c = '_'
        cleaned.append(c)
    return "".join(cleaned)
52 52
def main():
    """Command-line entry point.

    Usage:
        rename.py <input_fasta> rename <output_fasta> <index_csv>
        rename.py <input_fasta> truncate <output_fasta>

    "rename" calls rename() with a csv writer opened on <index_csv>;
    "truncate" calls truncate() with the set of characters allowed to remain
    in a sequence name (ASCII letters, punctuation, digits and the space).

    Raises:
        SystemExit: If required arguments are missing or the operation is
            neither "rename" nor "truncate".
    """
    usage = ("Usage: rename.py <input_fasta> rename <output_fasta> <index_csv>\n"
             "       rename.py <input_fasta> truncate <output_fasta>")
    if len(sys.argv) < 4:
        sys.exit(usage)
    inputfile = str(sys.argv[1])
    manipulate = str(sys.argv[2])
    outputfile = str(sys.argv[3])
    if manipulate == "rename":
        if len(sys.argv) < 5:
            sys.exit(usage)
        indexfile = str(sys.argv[4])
        # The context manager guarantees the index file is flushed and closed
        # even if rename() fails part-way through.
        with open(indexfile, 'w') as csvfile:
            writer = csv.writer(csvfile)
            rename(inputfile, outputfile, writer)
    elif manipulate == "truncate":
        # string.ascii_letters is locale-independent (string.letters is not,
        # and does not exist on Python 3).
        valid_characters = string.ascii_letters + string.punctuation + string.digits + ' '
        truncate(inputfile, outputfile, valid_characters)
    else:
        # Fail loudly rather than exiting successfully without any output.
        sys.exit("Unknown operation '%s'.\n%s" % (manipulate, usage))


if __name__ == "__main__":
    main()
67 68
68 69