Mercurial > repos > dereeper > ragoo
comparison RaGOO/lift_over.py @ 13:b9a3aeb162ab draft default tip
Uploaded
| author | dereeper |
|---|---|
| date | Mon, 26 Jul 2021 18:22:37 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 12:68a9ec9ce51e | 13:b9a3aeb162ab |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 def get_contig_lengths(in_fai): | |
| 3 """ Get contig lengths from a fasta index. """ | |
| 4 lens = dict() | |
| 5 with open(in_fai, 'r') as f: | |
| 6 for line in f: | |
| 7 L1 = line.rstrip().split('\t') | |
| 8 lens[L1[0]] = int(L1[1]) | |
| 9 return lens | |
| 10 | |
| 11 | |
| 12 def get_contig_orderings(in_groupings): | |
| 13 """ | |
| 14 From the orderings files, return the following: | |
| 15 | |
| 16 1. Dictionary associating a reference header with a list of ordered contig headers | |
| 17 2. Dictionary associating a contig with an orientation | |
| 18 3. Dicitonary associating a contig with its assigned reference header | |
| 19 """ | |
| 20 orderings = dict() | |
| 21 orientations = dict() | |
| 22 ref_chr = dict() | |
| 23 | |
| 24 # Iterate through the orderings files | |
| 25 with open(in_groupings, 'r') as f1: | |
| 26 for line1 in f1: | |
| 27 header = line1.rstrip().replace('_orderings.txt', '_RaGOO') | |
| 28 header = header[header.rfind('/') + 1:] | |
| 29 | |
| 30 # Initialize the list for the orderings | |
| 31 orderings[header] = [] | |
| 32 with open(line1.rstrip(), 'r') as f2: | |
| 33 for line2 in f2: | |
| 34 L1 = line2.rstrip().split('\t') | |
| 35 orderings[header].append(L1[0]) | |
| 36 orientations[L1[0]] = L1[1] | |
| 37 ref_chr[L1[0]] = header | |
| 38 return orderings, orientations, ref_chr | |
| 39 | |
| 40 | |
| 41 def get_reverse_coords(start_coord, end_coord, seq_length): | |
| 42 """ | |
| 43 Returns new genomic coordinates of a region that has undergone reverse complementation. | |
| 44 new start coordinate = seqLength - endCoord | |
| 45 new end coordinate = seqLength - startCoord | |
| 46 """ | |
| 47 return seq_length - (end_coord - 1), seq_length - (start_coord - 1) | |
| 48 | |
| 49 | |
| 50 if __name__ == "__main__": | |
| 51 import argparse | |
| 52 | |
| 53 parser = argparse.ArgumentParser(description='Lift-over gff coordinates to from contigs to RaGOO pseudomolecules.' | |
| 54 'Please make sure that you are using the updated gff file and set of contigs if chimera correction was used.' | |
| 55 'Also make sure that the gap size (-g) matches that which was used during ordering and orienting.') | |
| 56 parser.add_argument("gff", metavar="<genes.gff>", type=str, help="Gff file to be lifted-over") | |
| 57 parser.add_argument("orderings", metavar="<orderings.fofn>", type=str, help="List of RaGOO 'orderings' files. 'ls ragoo_output/orderings/* > orderings.fofn'") | |
| 58 parser.add_argument("fai", metavar="<contigs.fasta.fai>", type=str, help="Fasta index for contigs.'") | |
| 59 parser.add_argument("-g", metavar="100", type=int, default=100, help="Gap size for padding in pseudomolecules (must match what was used for 'ragoo.py'.") | |
| 60 | |
| 61 | |
| 62 # Get the command line arguments | |
| 63 args = parser.parse_args() | |
| 64 gff_file = args.gff | |
| 65 orderings_file = args.orderings | |
| 66 fai_file = args.fai | |
| 67 gap_size = args.g | |
| 68 | |
| 69 # Get the contig lengths | |
| 70 ctg_lens = get_contig_lengths(fai_file) | |
| 71 | |
| 72 # Get the contig orderings and orientations | |
| 73 ctg_orderings, ctg_orientations, ctg_chr = get_contig_orderings(orderings_file) | |
| 74 | |
| 75 #Iterate through the GFF features and update | |
| 76 offsets = dict() | |
| 77 with open(gff_file) as f: | |
| 78 for gff_line in f: | |
| 79 if gff_line.startswith("#"): | |
| 80 print(gff_line.rstrip()) | |
| 81 else: | |
| 82 gff_fields = gff_line.rstrip().split('\t') | |
| 83 gff_header, start, end, strand = gff_fields[0], int(gff_fields[3]), int(gff_fields[4]), gff_fields[6] | |
| 84 new_header = ctg_chr[gff_header] | |
| 85 | |
| 86 # Check that this contig header is in our list of headers | |
| 87 if gff_header not in ctg_lens.keys(): | |
| 88 err_msg = """ %s was not found in the list of orderings files provided. | |
| 89 Please check that you are using the correct files. If chimeric contig correction was | |
| 90 used, please use the corrected gff and fai file. | |
| 91 """ | |
| 92 raise ValueError(err_msg) | |
| 93 | |
| 94 # Check if the contig has been reverse complemented. Update accordingly | |
| 95 if ctg_orientations[gff_header] == '-': | |
| 96 start, end = get_reverse_coords(start, end, ctg_lens[gff_header]) | |
| 97 | |
| 98 # Set the opposite strand | |
| 99 if strand == '+': | |
| 100 strand = '-' | |
| 101 else: | |
| 102 strand = '+' | |
| 103 | |
| 104 # Check if the offset for this contig has already been calculated | |
| 105 if gff_header in offsets: | |
| 106 offset = offsets[gff_header] | |
| 107 else: | |
| 108 ctg_idx = ctg_orderings[new_header].index(gff_header) | |
| 109 offset = 0 | |
| 110 | |
| 111 for ctg in ctg_orderings[new_header][:ctg_idx]: | |
| 112 offset += ctg_lens[ctg] | |
| 113 offset += gap_size | |
| 114 | |
| 115 # memoize the offset | |
| 116 offsets[gff_header] = offset | |
| 117 | |
| 118 new_start = start + offset | |
| 119 new_end = end + offset | |
| 120 | |
| 121 gff_fields[0] = new_header | |
| 122 gff_fields[3] = str(new_start) | |
| 123 gff_fields[4] = str(new_end) | |
| 124 gff_fields[6] = strand | |
| 125 print('\t'.join(gff_fields)) |
