retrieve_ensembl_bed: retrieve_ensembl

comparison retrieve_ensembl_bed.py @ 1:9c4a48f5d4e7 draft default tip

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/retrieve_ensembl_bed commit 6babd357845126292cb202aaea0f70ff68819525"

author	galaxyp
date	Mon, 07 Oct 2019 16:14:39 -0400
parents	da1b538b87e5
children

comparison

equal deleted inserted replaced

-:da1b538b87e5
+:9c4a48f5d4e7
 help='Ensembl Species to retrieve')
 parser.add_argument(
 '-R', '--regions', action='append', default=[],
 help='Restrict Ensembl retrieval to regions e.g.:'
 + ' X,2:20000-25000,3:100-500+')
+parser.add_argument(
+'-i', '--interval_file', default=None,
+help='Regions from a bed, gff, or interval file')
+parser.add_argument(
+'-f', '--interval_format', choices=['bed','gff','interval'], default='interval',
+help='Interval format has TAB-separated columns: Seq, Start, End, Strand')
 parser.add_argument(
 '-B', '--biotypes', action='append', default=[],
 help='Restrict Ensembl biotypes to retrieve')
 parser.add_argument(
 '-X', '--extended_bed', action='store_true', default=False,
 if chrom not in selected_regions:
 selected_regions[chrom] = []
 selected_regions[chrom].append([start, end, strand])
 if args.debug:
 print("selected_regions: %s" % selected_regions, file=sys.stderr)
if args.interval_file:
    # Merge the regions listed in a tab-separated file into selected_regions.
    # Every pattern captures (chrom, start, end, strand); a leading 'chr' on
    # the sequence name is not captured. Captured values stay as the matched
    # strings, and optional fields that are absent come back as None.
    #
    # Generic interval layout: Seq[, Start[, End[, Strand]]].
    pat = r'^(?:chr)?([^\t]+)(?:\t(\d+)(?:\t(\d+)(?:\t([+-])?)?)?)?.*'
    if args.interval_format == 'bed':
        # BED layout: chrom, start, end[, name, score, strand].
        pat = r'^(?:chr)?([^\t]+)\t(\d+)\t(\d+)(?:(?:\t[^\t]+\t[^\t]+\t)([+-]))?.*'
    elif args.interval_format == 'gff':
        # GFF layout: seqid, source, type, start, end[, score, strand, ...].
        # The source and type columns must be skipped before start/end;
        # reusing the BED pattern here would reject every ordinary GFF line.
        pat = r'^(?:chr)?([^\t]+)\t[^\t]+\t[^\t]+\t(\d+)\t(\d+)(?:\t[^\t]+\t([+-]))?.*'
    # NOTE(review): BED starts are 0-based while GFF starts are 1-based;
    # coordinates are passed through unchanged here - confirm that the
    # code consuming selected_regions expects this.
    line_re = re.compile(pat)  # compiled once, reused for every line
    with open(args.interval_file, 'r') as fh:
        for line in fh:
            # Comment / header lines carry no region.
            if line.startswith('#'):
                continue
            m = line_re.match(line.rstrip())
            if not m:
                # Lines that do not fit the chosen layout are ignored.
                continue
            chrom, start, end, strand = m.groups()
            if chrom:
                selected_regions.setdefault(chrom, []).append(
                    [start, end, strand])
    if args.debug:
        print("selected_regions: %s" % selected_regions, file=sys.stderr)
 def retrieve_region(species, ref, start, stop, strand):
 transcript_count = 0
 regions = list(range(start, stop, max_region))
 if not regions or regions[-1] < stop:

Mercurial > repos > galaxyp > retrieve_ensembl_bed

comparison retrieve_ensembl_bed.py @ 1:9c4a48f5d4e7 draft default tip