gstf_preparation: gstf_preparation.py comparison

comparison gstf_preparation.py @ 6:56bbdbfe3eaa draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960

author	earlhaminst
date	Wed, 25 Apr 2018 11:00:33 -0400
parents	b3ba0c84667c
children	92f3966d5bc3

comparison

equal deleted inserted replaced

-:b3ba0c84667c
+:56bbdbfe3eaa
 import json
 import optparse
 import sqlite3
 import sys
-version = "0.3.0"
+version = "0.4.0"
 gene_count = 0
 Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])
 (version, ))
 cur.execute('''CREATE TABLE gene (
 gene_id VARCHAR PRIMARY KEY NOT NULL,
 gene_symbol VARCHAR,
+seq_region_name VARCHAR NOT NULL,
+seq_region_start INTEGER NOT NULL,
+seq_region_end INTEGER NOT NULL,
+seq_region_strand INTEGER NOT NULL,
 species VARCHAR NOT NULL,
 gene_json VARCHAR NOT NULL)''')
 cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)')
 cur.execute('''CREATE TABLE transcript (
 protein_id VARCHAR UNIQUE,
 protein_sequence VARCHAR,
 gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''')
 cur.execute('''CREATE VIEW transcript_species AS
-SELECT transcript_id, species
+SELECT transcript_id, species, seq_region_name
 FROM transcript JOIN gene
 ON transcript.gene_id = gene.gene_id''')
 conn.commit()
 for gene in gene_dict.values():
 if gene is None:
 # This can happen when loading a JSON file from Ensembl
 continue
 gene_id = gene['id']
-cur.execute('INSERT INTO gene (gene_id, gene_symbol, species, gene_json) VALUES (?, ?, ?, ?)',
+cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
-(gene_id, gene.get('display_name', None), gene['species'], json.dumps(gene)))
+(gene_id, gene.get('display_name', None), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], json.dumps(gene)))
 if "Transcript" in gene:
 for transcript in gene["Transcript"]:
 transcript_id = transcript['id']
 protein_id = transcript.get('Translation', {}).get('id', None)
 raise Exception("Error while inserting (%s, %s, %s) into transcript table: %s" % (transcript_id, protein_id, gene_id, e))
 conn.commit()
-def fetch_species_for_transcript(conn, transcript_id):
+def fetch_species_and_seq_region_for_transcript(conn, transcript_id):
 cur = conn.cursor()
-cur.execute('SELECT species FROM transcript_species WHERE transcript_id=?',
+cur.execute('SELECT species, seq_region_name FROM transcript_species WHERE transcript_id=?',
 (transcript_id, ))
 results = cur.fetchone()
 if not results:
 return None
-return results[0]
+return results
 def fetch_gene_id_for_transcript(conn, transcript_id):
 cur = conn.cursor()
 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files')
 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files')
 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files')
 parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene')
 parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format')
+parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered')
 parser.add_option('-o', '--output', help='Path of the output SQLite file')
 parser.add_option('--of', help='Path of the output FASTA file')
+parser.add_option('--ff', help='Path of the filtered sequences output FASTA file')
 options, args = parser.parse_args()
 if args:
 raise Exception('Use options to provide inputs')
 conn = sqlite3.connect(options.output)
 # For each gene, select the transcript with the longest sequence
 # If more than one transcripts have the same longest sequence for a gene, the
 # first one to appear in the FASTA file is selected
 selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()]
-with open(options.of, 'w') as output_fasta_file:
+regions = [_.strip().lower() for _ in options.regions.split(",")]
+with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file:
 for fasta_arg in options.fasta:
 for entry in FASTAReader_gen(fasta_arg):
 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
 if options.longestCDS and transcript_id not in selected_transcript_ids:
 continue
-species_for_transcript = fetch_species_for_transcript(conn, transcript_id)
+species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id)
 if not species_for_transcript:
 print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
 continue
 if options.headers:
 # Remove any underscore in the species
 header = ">%s_%s" % (transcript_id, species_for_transcript.replace('_', ''))
 else:
 header = entry.header
-output_fasta_file.write("%s\n%s\n" % (header, entry.sequence))
+if seq_region_for_transcript.lower() in regions:
+filtered_fasta_file.write("%s\n%s\n" % (header, entry.sequence))
+else:
+output_fasta_file.write("%s\n%s\n" % (header, entry.sequence))
 conn.close()
 if __name__ == '__main__':

Mercurial > repos > earlhaminst > gstf_preparation

comparison gstf_preparation.py @ 6:56bbdbfe3eaa draft