gstf_preparation: gstf_preparation.py comparison

comparison gstf_preparation.py @ 5:b3ba0c84667c draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae

author	earlhaminst
date	Mon, 16 Apr 2018 14:05:09 -0400
parents	284f64ad9d43
children	56bbdbfe3eaa

comparison

Legend: equal (no marker), deleted (-), inserted (+), replaced (a - line paired with + lines)

-:284f64ad9d43
+:b3ba0c84667c
 derived_translation_end = None
 if transcript_id in cds_parent_dict:
 cds_list = cds_parent_dict[transcript_id]
 cds_ids = set(_['id'] for _ in cds_list)
 if len(cds_ids) > 1:
-raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % parent)
+raise Exception("Transcript %s has multiple CDSs: this is not supported by Ensembl JSON format" % transcript_id)
-translation['id'] = cds_ids.pop()
+cds_id = cds_ids.pop()
+translation['id'] = cds_id
 cds_list.sort(key=lambda _: _['start'])
 translation['CDS'] = cds_list
 translation['start'] = cds_list[0]['start']
 translation['end'] = cds_list[-1]['end']
 found_cds = True
 else:
 derived_translation_start = three_prime_utr_list[-1]['end'] + 1
 if derived_translation_start is not None:
 if found_cds:
 if derived_translation_start > translation['start']:
-raise Exception("UTR overlaps with CDS")
+raise Exception("Transcript %s has the start of CDS %s overlapping with the UTR end" % (transcript_id, cds_id))
 else:
 translation['start'] = derived_translation_start
 if derived_translation_end is not None:
 if found_cds:
 if derived_translation_end < translation['end']:
-raise Exception("UTR overlaps with CDS")
+raise Exception("Transcript %s has the end of CDS %s overlapping with the UTR start" % (transcript_id, cds_id))
 else:
 translation['end'] = derived_translation_end
 if found_cds or derived_translation_start is not None or derived_translation_end is not None:
 transcript['Translation'] = translation
 transcript_dict = dict()
 exon_parent_dict = dict()
 cds_parent_dict = dict()
 five_prime_utr_parent_dict = dict()
 three_prime_utr_parent_dict = dict()
+unimplemented_feature_nlines_dict = dict()
 with open(filename) as f:
 for i, line in enumerate(f, start=1):
 line = line.strip()
 if not line:
 feature_to_dict(cols, five_prime_utr_parent_dict)
 elif feature_type == 'three_prime_UTR':
 feature_to_dict(cols, three_prime_utr_parent_dict)
 elif feature_type == 'CDS':
 add_cds_to_dict(cols, cds_parent_dict)
+elif feature_type in unimplemented_feature_nlines_dict:
+unimplemented_feature_nlines_dict[feature_type] += 1
 else:
-print("Line %i in file '%s': '%s' is not an implemented feature type" % (i, filename, feature_type), file=sys.stderr)
+unimplemented_feature_nlines_dict[feature_type] = 0
 except Exception as e:
 print("Line %i in file '%s': %s" % (i, filename, e), file=sys.stderr)
+for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items():
+print("Skipped %d lines in file '%s': '%s' is not an implemented feature type" % (nlines, filename, unimplemented_feature), file=sys.stderr)
 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict)
 write_gene_dict_to_db(conn, gene_dict)
 for json_arg in options.json:
 # Extract the transcript id by removing everything after the first space and then removing the version if it is an Ensembl id
 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0])
 gene_id = fetch_gene_id_for_transcript(conn, transcript_id)
 if not gene_id:
+print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
 continue
 if gene_id in gene_transcripts_dict:
 gene_transcripts_dict[gene_id].append((transcript_id, len(entry.sequence)))
 else:
 if options.longestCDS and transcript_id not in selected_transcript_ids:
 continue
 species_for_transcript = fetch_species_for_transcript(conn, transcript_id)
 if not species_for_transcript:
-print("Transcript '%s' not found in the gene feature information" % transcript_id, file=sys.stderr)
+print("Transcript '%s' in file '%s' not found in the gene feature information" % (transcript_id, fasta_arg), file=sys.stderr)
 continue
 if options.headers:
 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest
 # Remove any underscore in the species

Mercurial > repos > earlhaminst > gstf_preparation

comparison gstf_preparation.py @ 5:b3ba0c84667c draft