Mercurial > repos > yating-l > rename_scaffolds
comparison rename.py @ 4:e35a3509c160 draft
planemo upload commit 3675b4447d11fb1cd75d505886e1bf693f9d07f5
author | yating-l |
---|---|
date | Thu, 26 Jul 2018 15:46:24 -0400 |
parents | 9529a207d704 |
children | 7c8b327f298c |
comparison
legend: equal | deleted | inserted | replaced
3:9529a207d704 | 4:e35a3509c160 |
---|---|
1 """ | 1 """ |
2 Call rename to rename scaffolds in reference genome so that the sequence names are less than 31 characters. Rename all scaffolds to scaffold_1, scaffold_2, ..., scaffold_N and provide a name mapping file | 2 Call rename to rename scaffolds in reference genome so that the sequence names are less than 31 characters. Rename all scaffolds to scaffold_1, scaffold_2, ..., scaffold_N and provide a name mapping file |
3 Call truncate to truncate the scaffold names that are more than 31 characters. Replace non-ASCII character with '_' | 3 Call truncate to truncate the scaffold names that are more than 31 characters. Replace each invalid character (non-ASCII, '\t', '\n', '\x0b', '\x0c', '\r') with '_' |
4 """ | 4 """ |
5 import sys | 5 import sys |
6 import csv | 6 import csv |
7 import codecs | 7 import codecs |
8 import string | 8 import string |
19 line = ">" + newname + "\n" | 19 line = ">" + newname + "\n" |
20 i = i+1 | 20 i = i+1 |
21 writer.writerow([oldname.encode('utf-8'), newname]) | 21 writer.writerow([oldname.encode('utf-8'), newname]) |
22 out.write(line) | 22 out.write(line) |
23 | 23 |
24 def truncate(inputFile, outputFile): | 24 def truncate(inputFile, outputFile, valid_characters): |
25 names = [] | 25 names = [] |
26 with codecs.open(outputFile, 'w', encoding='utf-8') as out: | 26 with codecs.open(outputFile, 'w', encoding='utf-8') as out: |
27 with codecs.open(inputFile, 'r', encoding='utf-8') as rf: | 27 with codecs.open(inputFile, 'r', encoding='utf-8') as rf: |
28 lines = rf.readlines() | 28 lines = rf.readlines() |
29 for l in lines: | 29 for l in lines: |
30 if ">" in l: | 30 if ">" in l: |
31 print l.encode('utf-8') | 31 print l.encode('utf-8') |
32 name = l[1:].rstrip() | 32 name = l[1:].rstrip() |
33 name = substituteNonAscii(name) | 33 name = substituteNonAscii(name, valid_characters) |
34 if len(name) > 31: | 34 if len(name) > 31: |
35 name = name[:31] | 35 name = name[:31] |
36 print "\tTruncate the scaffold name to less than 31 characters: %s" % name | 36 print "\tTruncate the scaffold name to less than 31 characters: %s" % name |
37 if name in names: | 37 if name in names: |
38 sys.exit("Name conflict! Name " + name + " already exist.") | 38 sys.exit("Name conflict! Name " + name + " already exist.") |
39 names.append(name) | 39 names.append(name) |
40 l = ">" + name + "\n" | 40 l = ">" + name + "\n" |
41 print "======================\n" | 41 print "======================\n" |
42 out.write(l) | 42 out.write(l) |
43 | 43 |
44 def substituteNonAscii(str): | 44 def substituteNonAscii(str, valid_charaters): |
45 l = [] | 45 l = [] |
46 for c in str: | 46 for c in str: |
47 if c not in string.printable: | 47 if c not in valid_charaters: |
48 print "\tSubstitute Non-ASCII character %s with _" % c.encode('utf-8') | 48 print "\tSubstitute invalid character %s with _" % c.encode('utf-8') |
49 c = '_' | 49 c = '_' |
50 l.append(c) | 50 l.append(c) |
51 return "".join(l) | 51 return "".join(l) |
52 | 52 |
53 def main(): | 53 def main(): |
54 inputfile = str(sys.argv[1]) | 54 inputfile = str(sys.argv[1]) |
55 manipulate = str(sys.argv[2]) | 55 manipulate = str(sys.argv[2]) |
56 outputfile = str(sys.argv[3]) | 56 outputfile = str(sys.argv[3]) |
57 valid_characters = string.letters + string.punctuation + string.digits + ' ' | |
57 if manipulate == "rename": | 58 if manipulate == "rename": |
58 indexfile = str(sys.argv[4]) | 59 indexfile = str(sys.argv[4]) |
59 csvfile = open(indexfile, 'w') | 60 csvfile = open(indexfile, 'w') |
60 writer = csv.writer(csvfile) | 61 writer = csv.writer(csvfile) |
61 rename(inputfile, outputfile, writer) | 62 rename(inputfile, outputfile, writer) |
62 elif manipulate == "truncate": | 63 elif manipulate == "truncate": |
63 truncate(inputfile, outputfile) | 64 truncate(inputfile, outputfile, valid_characters) |
64 | 65 |
65 if __name__ == "__main__": | 66 if __name__ == "__main__": |
66 main() | 67 main() |
67 | 68 |
68 | 69 |