comparison retrieve_ensembl_bed.py @ 1:9c4a48f5d4e7 draft default tip

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/retrieve_ensembl_bed commit 6babd357845126292cb202aaea0f70ff68819525"
author galaxyp
date Mon, 07 Oct 2019 16:14:39 -0400
parents da1b538b87e5
children
comparison
equal deleted inserted replaced
0:da1b538b87e5 1:9c4a48f5d4e7
31 help='Ensembl Species to retrieve') 31 help='Ensembl Species to retrieve')
32 parser.add_argument( 32 parser.add_argument(
33 '-R', '--regions', action='append', default=[], 33 '-R', '--regions', action='append', default=[],
34 help='Restrict Ensembl retrieval to regions e.g.:' 34 help='Restrict Ensembl retrieval to regions e.g.:'
35 + ' X,2:20000-25000,3:100-500+') 35 + ' X,2:20000-25000,3:100-500+')
36 parser.add_argument(
37 '-i', '--interval_file', default=None,
38 help='Regions from a bed, gff, or interval file')
39 parser.add_argument(
40 '-f', '--interval_format', choices=['bed','gff','interval'], default='interval',
41 help='Interval format has TAB-separated columns: Seq, Start, End, Strand')
36 parser.add_argument( 42 parser.add_argument(
37 '-B', '--biotypes', action='append', default=[], 43 '-B', '--biotypes', action='append', default=[],
38 help='Restrict Ensembl biotypes to retrieve') 44 help='Restrict Ensembl biotypes to retrieve')
39 parser.add_argument( 45 parser.add_argument(
40 '-X', '--extended_bed', action='store_true', default=False, 46 '-X', '--extended_bed', action='store_true', default=False,
72 if chrom not in selected_regions: 78 if chrom not in selected_regions:
73 selected_regions[chrom] = [] 79 selected_regions[chrom] = []
74 selected_regions[chrom].append([start, end, strand]) 80 selected_regions[chrom].append([start, end, strand])
75 if args.debug: 81 if args.debug:
76 print("selected_regions: %s" % selected_regions, file=sys.stderr) 82 print("selected_regions: %s" % selected_regions, file=sys.stderr)
83
84 if args.interval_file:
85 pat = r'^(?:chr)?([^\t]+)(?:\t(\d+)(?:\t(\d+)(?:\t([+-])?)?)?)?.*'
86 if args.interval_format == 'bed':
87 pat = r'^(?:chr)?([^\t]+)\t(\d+)\t(\d+)(?:(?:\t[^\t]+\t[^\t]+\t)([+-]))?.*'
88 elif args.interval_format == 'gff':
89 pat = r'^(?:chr)?([^\t]+)\t(\d+)\t(\d+)(?:(?:\t[^\t]+\t[^\t]+\t)([+-]))?.*'
90 with open(args.interval_file,'r') as fh:
91 for i, line in enumerate(fh):
92 if line.startswith('#'):
93 continue
94 m = re.match(pat, line.rstrip())
95 if m:
96 (chrom, start, end, strand) = m.groups()
97 if chrom:
98 if chrom not in selected_regions:
99 selected_regions[chrom] = []
100 selected_regions[chrom].append([start, end, strand])
101 if args.debug:
102 print("selected_regions: %s" % selected_regions, file=sys.stderr)
103
77 104
78 def retrieve_region(species, ref, start, stop, strand): 105 def retrieve_region(species, ref, start, stop, strand):
79 transcript_count = 0 106 transcript_count = 0
80 regions = list(range(start, stop, max_region)) 107 regions = list(range(start, stop, max_region))
81 if not regions or regions[-1] < stop: 108 if not regions or regions[-1] < stop: