Mercurial > repos > earlhaminst > gstf_preparation
diff gstf_preparation.py @ 6:56bbdbfe3eaa draft
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
author | earlhaminst |
---|---|
date | Wed, 25 Apr 2018 11:00:33 -0400 |
parents | b3ba0c84667c |
children | 92f3966d5bc3 |
line wrap: on
line diff
--- a/gstf_preparation.py Mon Apr 16 14:05:09 2018 -0400 +++ b/gstf_preparation.py Wed Apr 25 11:00:33 2018 -0400 @@ -6,7 +6,7 @@ import sqlite3 import sys -version = "0.3.0" +version = "0.4.0" gene_count = 0 Sequence = collections.namedtuple('Sequence', ['header', 'sequence']) @@ -41,6 +41,10 @@ cur.execute('''CREATE TABLE gene ( gene_id VARCHAR PRIMARY KEY NOT NULL, gene_symbol VARCHAR, + seq_region_name VARCHAR NOT NULL, + seq_region_start INTEGER NOT NULL, + seq_region_end INTEGER NOT NULL, + seq_region_strand INTEGER NOT NULL, species VARCHAR NOT NULL, gene_json VARCHAR NOT NULL)''') cur.execute('CREATE INDEX gene_symbol_index ON gene (gene_symbol)') @@ -52,7 +56,7 @@ gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))''') cur.execute('''CREATE VIEW transcript_species AS - SELECT transcript_id, species + SELECT transcript_id, species, seq_region_name FROM transcript JOIN gene ON transcript.gene_id = gene.gene_id''') @@ -225,8 +229,8 @@ # This can happen when loading a JSON file from Ensembl continue gene_id = gene['id'] - cur.execute('INSERT INTO gene (gene_id, gene_symbol, species, gene_json) VALUES (?, ?, ?, ?)', - (gene_id, gene.get('display_name', None), gene['species'], json.dumps(gene))) + cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?)', + (gene_id, gene.get('display_name', None), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], json.dumps(gene))) if "Transcript" in gene: for transcript in gene["Transcript"]: @@ -241,15 +245,15 @@ conn.commit() -def fetch_species_for_transcript(conn, transcript_id): +def fetch_species_and_seq_region_for_transcript(conn, transcript_id): cur = conn.cursor() - cur.execute('SELECT species FROM transcript_species WHERE transcript_id=?', + cur.execute('SELECT species, seq_region_name FROM transcript_species WHERE transcript_id=?', (transcript_id, )) results = cur.fetchone() if not results: return None - return results[0] + return results def fetch_gene_id_for_transcript(conn, transcript_id): @@ -280,8 +284,11 @@ parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') parser.add_option('-l', action='store_true', default=False, dest='longestCDS', help='Keep only the longest CDS per gene') parser.add_option('--headers', action='store_true', default=False, help='Change the header line of the FASTA sequences to the >TranscriptId_species format') + parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered') parser.add_option('-o', '--output', help='Path of the output SQLite file') parser.add_option('--of', help='Path of the output FASTA file') + parser.add_option('--ff', help='Path of the filtered sequences output FASTA file') + options, args = parser.parse_args() if args: raise Exception('Use options to provide inputs') @@ -368,14 +375,15 @@ # first one to appear in the FASTA file is selected selected_transcript_ids = [max(transcript_id_lengths, key=lambda _: _[1])[0] for transcript_id_lengths in gene_transcripts_dict.values()] - with open(options.of, 'w') as output_fasta_file: + regions = [_.strip().lower() for _ in options.regions.split(",")] + with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file: for fasta_arg in options.fasta: for entry in FASTAReader_gen(fasta_arg): transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0]) if options.longestCDS and transcript_id not in selected_transcript_ids: continue - species_for_transcript = fetch_species_for_transcript(conn, transcript_id) + species_for_transcript, seq_region_for_transcript = fetch_species_and_seq_region_for_transcript(conn, transcript_id) if not species_for_transcript: print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr) continue @@ -387,7 +395,10 @@ else: header = entry.header - output_fasta_file.write("%s\n%s\n" % (header, entry.sequence)) + if seq_region_for_transcript.lower() in regions: + filtered_fasta_file.write("%s\n%s\n" % (header, entry.sequence)) + else: + output_fasta_file.write("%s\n%s\n" % (header, entry.sequence)) conn.close()