Mercurial > repos > portiahollyoak > genbank_to_fasta
comparison genbank_to_fasta.py @ 0:bcdd1a35e545 draft default tip
planemo upload for repository https://github.com/portiahollyoak/Tools commit 132bb96bba8e7aed66a102ed93b7744f36d10d37-dirty
| author | portiahollyoak |
|---|---|
| date | Fri, 22 Apr 2016 12:09:14 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:bcdd1a35e545 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # coding: utf-8 | |
| 3 | |
| 4 import argparse | |
| 5 import doctest # This will test if the functions are working | |
| 6 | |
| 7 | |
def get_id(line):
    """
    Return the entry name found on an ``ID`` header line.

    The name is the second whitespace-delimited field of the line, so any
    run of spaces or tabs between the ``ID`` line code and the name is
    accepted.

    Returns ``None`` when the line does not start with ``ID``.
    Raises ``IndexError`` when the line starts with ``ID`` but carries no
    further field.

    >>> line = 'ID   TE   standard; DNA; INV; 7411 BP.'
    >>> 'TE' == get_id(line)
    True
    >>> get_id('SQ   Sequence 7411 BP;') is None
    True

    """
    if not line.startswith("ID"):
        return None
    # split() without an argument collapses runs of whitespace, so the name
    # is always field 1 no matter how wide the column padding is.
    return line.split()[1]
| 21 | |
| 22 | |
def get_seq(line):
    """
    Return the residues of one sequence line as a contiguous string.

    Every digit (the running position counter) and every whitespace
    character (column separators, tabs, a trailing newline) is dropped;
    all other characters are kept in their original order.

    >>> line = "AGTGACATAT TCACATACAA AACCACATAA CATAGAGTAA ACATATTGAA AAGCCGCATA 60"
    >>> 'AGTGACATATTCACATACAAAACCACATAACATAGAGTAAACATATTGAAAAGCCGCATA' == get_seq(line)
    True

    """
    # isspace() rather than a comparison with " " so that tabs and line
    # endings never end up inside the sequence when the caller has not
    # stripped the line first.
    return "".join(
        char for char in line if not (char.isdigit() or char.isspace())
    )
| 40 | |
| 41 | |
def make_seq_dictionary(input_file_handle):
    """
    Collect the sequences of a multi-record file, keyed by entry name.

    The handle is read line by line (each line is stripped first):

    * a line starting with ``ID`` opens a new record named by ``get_id``;
      a repeated name restarts that record with an empty sequence;
    * a line starting with ``SQ`` marks the following lines as sequence;
    * a line starting with ``//`` ends the sequence section;
    * every line in a sequence section is cleaned with ``get_seq`` and
      appended to the current record.

    Returns a dictionary mapping each entry name to its full sequence
    string.  Raises ``ValueError`` if sequence data appears before any
    ``ID`` line.
    """
    fragments_by_id = {}  # entry name -> list of cleaned sequence pieces
    current_id = None
    in_sequence = False
    for line in input_file_handle:
        line = line.strip()
        if line.startswith("ID"):
            current_id = get_id(line)
            fragments_by_id[current_id] = []
            # A new header always ends the previous sequence section, even
            # when its "//" terminator is missing; otherwise the header text
            # itself would be appended to the new record as sequence.
            in_sequence = False
            continue
        if line.startswith("SQ"):
            in_sequence = True
            continue
        if line.startswith("//"):
            in_sequence = False
            continue
        if in_sequence:
            if current_id is None:
                raise ValueError(
                    "sequence data found before any ID line: %r" % line
                )
            fragments_by_id[current_id].append(get_seq(line))
    # Join once per record instead of growing a string line by line.
    return {
        name: "".join(fragments)
        for name, fragments in fragments_by_id.items()
    }
| 63 | |
| 64 | |
def write_seq_d_to_file(seq_d, output):
    """
    Write every entry of ``seq_d`` to ``output`` in FASTA layout.

    For each (name, sequence) pair, in the dictionary's iteration order,
    a ``>name`` header line is written followed by the sequence on a
    single line.
    """
    for record in seq_d.items():
        name, residues = record
        header_line = ">%s\n" % name
        sequence_line = "%s\n" % residues
        output.write(header_line)
        output.write(sequence_line)
| 72 | |
description = (
    "This script will extract ID names and sequences from a multigenbank "
    "file and format them into a multifasta file."
)

# The text must be passed by keyword: the first positional parameter of
# ArgumentParser is ``prog``, which would show this sentence as the program
# name in the usage line and leave the help description empty.
parser = argparse.ArgumentParser(description=description)
parser.add_argument("input", help="A multi-genbank file.")
parser.add_argument("output", help="Name of the output fasta file.")


def main(argv=None):
    """
    Convert the input file named on the command line to a FASTA file.

    ``argv`` is the list of command-line arguments to parse; ``None``
    makes argparse read ``sys.argv[1:]``.
    """
    args = parser.parse_args(argv)

    # Only the open() call is guarded: Python 2's built-in open() rejects
    # the ``encoding`` keyword with TypeError.  Keeping the parsing outside
    # the try block means a TypeError raised while parsing is not mistaken
    # for that case and silently retried.
    try:
        input_file_handle = open(args.input, encoding="utf-8")
    except TypeError:
        input_file_handle = open(args.input)
    with input_file_handle:
        seq_d = make_seq_dictionary(input_file_handle)

    with open(args.output, "w") as output:
        write_seq_d_to_file(seq_d, output)


# Run the conversion only when executed as a script, so the module can be
# imported (for example to run its doctests) without parsing arguments.
if __name__ == "__main__":
    main()
| 92 |
