diff gstf_preparation.py @ 10:e8e75a79de59 draft

"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9c8611fee927883f50bc6955771aa69df1ce8457"
author earlhaminst
date Thu, 31 Oct 2019 08:16:51 -0400
parents f4acbfe8d6fe
children dbe37a658cd2
line wrap: on
line diff
--- a/gstf_preparation.py	Wed Oct 17 07:31:29 2018 -0400
+++ b/gstf_preparation.py	Thu Oct 31 08:16:51 2019 -0400
@@ -2,6 +2,7 @@
 
 import json
 import optparse
+import os
 import sqlite3
 import sys
 
@@ -114,10 +115,7 @@
         # a 5' UTR can be split among multiple exons
         # a CDS can be part of multiple transcripts
         for parent in d['Parent'].split(','):
-            if parent not in parent_dict:
-                parent_dict[parent] = [d]
-            else:
-                parent_dict[parent].append(d)
+            parent_dict.setdefault(parent, []).append(d)
     return d
 
 
@@ -139,6 +137,8 @@
 
 def add_transcript_to_dict(cols, species, transcript_dict):
     transcript = feature_to_dict(cols)
+    if 'biotype' in transcript and transcript['biotype'] != 'protein_coding':
+        return
     transcript.update({
         'object_type': 'Transcript',
         'seq_region_name': cols[0],
@@ -302,7 +302,7 @@
     parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered')
     parser.add_option('-o', '--output', help='Path of the output SQLite file')
     parser.add_option('--of', help='Path of the output FASTA file')
-    parser.add_option('--ff', help='Path of the filtered sequences output FASTA file')
+    parser.add_option('--ff', default=os.devnull, help='Path of the filtered sequences output FASTA file')
 
     options, args = parser.parse_args()
     if args:
@@ -403,10 +403,7 @@
             else:
                 break
 
-            if gene_id in gene_transcripts_dict:
-                gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence)))
-            else:
-                gene_transcripts_dict[gene_id] = [(transcript_id, len(entry.sequence))]
+            gene_transcripts_dict.setdefault(gene_id, []).append((transcript_id, len(entry.sequence)))
 
     if options.longestCDS:
         # For each gene, select the transcript with the longest sequence.