Mercurial > repos > galaxyp > retrieve_ensembl_bed
comparison retrieve_ensembl_bed.py @ 1:9c4a48f5d4e7 draft default tip
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/retrieve_ensembl_bed commit 6babd357845126292cb202aaea0f70ff68819525"
author | galaxyp |
---|---|
date | Mon, 07 Oct 2019 16:14:39 -0400 |
parents | da1b538b87e5 |
children |
comparison
equal
deleted
inserted
replaced
0:da1b538b87e5 | 1:9c4a48f5d4e7 |
---|---|
31 help='Ensembl Species to retrieve') | 31 help='Ensembl Species to retrieve') |
32 parser.add_argument( | 32 parser.add_argument( |
33 '-R', '--regions', action='append', default=[], | 33 '-R', '--regions', action='append', default=[], |
34 help='Restrict Ensembl retrieval to regions e.g.:' | 34 help='Restrict Ensembl retrieval to regions e.g.:' |
35 + ' X,2:20000-25000,3:100-500+') | 35 + ' X,2:20000-25000,3:100-500+') |
36 parser.add_argument( | |
37 '-i', '--interval_file', default=None, | |
38 help='Regions from a bed, gff, or interval file') | |
39 parser.add_argument( | |
40 '-f', '--interval_format', choices=['bed','gff','interval'], default='interval', | |
41 help='Interval format has TAB-separated columns: Seq, Start, End, Strand') | |
36 parser.add_argument( | 42 parser.add_argument( |
37 '-B', '--biotypes', action='append', default=[], | 43 '-B', '--biotypes', action='append', default=[], |
38 help='Restrict Ensembl biotypes to retrieve') | 44 help='Restrict Ensembl biotypes to retrieve') |
39 parser.add_argument( | 45 parser.add_argument( |
40 '-X', '--extended_bed', action='store_true', default=False, | 46 '-X', '--extended_bed', action='store_true', default=False, |
72 if chrom not in selected_regions: | 78 if chrom not in selected_regions: |
73 selected_regions[chrom] = [] | 79 selected_regions[chrom] = [] |
74 selected_regions[chrom].append([start, end, strand]) | 80 selected_regions[chrom].append([start, end, strand]) |
75 if args.debug: | 81 if args.debug: |
76 print("selected_regions: %s" % selected_regions, file=sys.stderr) | 82 print("selected_regions: %s" % selected_regions, file=sys.stderr) |
83 | |
84 if args.interval_file: | |
85 pat = r'^(?:chr)?([^\t]+)(?:\t(\d+)(?:\t(\d+)(?:\t([+-])?)?)?)?.*' | |
86 if args.interval_format == 'bed': | |
87 pat = r'^(?:chr)?([^\t]+)\t(\d+)\t(\d+)(?:(?:\t[^\t]+\t[^\t]+\t)([+-]))?.*' | |
88 elif args.interval_format == 'gff': | |
89 pat = r'^(?:chr)?([^\t]+)\t(\d+)\t(\d+)(?:(?:\t[^\t]+\t[^\t]+\t)([+-]))?.*' | |
90 with open(args.interval_file,'r') as fh: | |
91 for i, line in enumerate(fh): | |
92 if line.startswith('#'): | |
93 continue | |
94 m = re.match(pat, line.rstrip()) | |
95 if m: | |
96 (chrom, start, end, strand) = m.groups() | |
97 if chrom: | |
98 if chrom not in selected_regions: | |
99 selected_regions[chrom] = [] | |
100 selected_regions[chrom].append([start, end, strand]) | |
101 if args.debug: | |
102 print("selected_regions: %s" % selected_regions, file=sys.stderr) | |
103 | |
77 | 104 |
78 def retrieve_region(species, ref, start, stop, strand): | 105 def retrieve_region(species, ref, start, stop, strand): |
79 transcript_count = 0 | 106 transcript_count = 0 |
80 regions = list(range(start, stop, max_region)) | 107 regions = list(range(start, stop, max_region)) |
81 if not regions or regions[-1] < stop: | 108 if not regions or regions[-1] < stop: |