| 0 | 1 #!/usr/bin/env python | 
|  | 2 ''' interlacing two fastq sequences''' | 
|  | 3 import sys | 
|  | 4 | 
|  | 5 def readSingleSeq(file): | 
|  | 6     ''' read single seq from fasta file''' | 
|  | 7     line = file.readline() | 
|  | 8     if not line: | 
|  | 9         return False  # end of file | 
|  | 10     if line[0] != ">": | 
|  | 11         raise Exception("no header on the first line") | 
|  | 12     seqname = line[1:].strip() | 
|  | 13     seq = "" | 
|  | 14     # read sequences | 
|  | 15     while True: | 
|  | 16         last_pos = file.tell() | 
|  | 17         line = file.readline() | 
|  | 18         if not line: | 
|  | 19             break | 
|  | 20         if line[0] == ">": | 
|  | 21             file.seek(last_pos) | 
|  | 22             break | 
|  | 23         seq = seq + line.strip() | 
|  | 24     return {'name': seqname, 'sequence': seq} | 
|  | 25 | 
|  | 26 | 
|  | 27 def writeSingleSeq(fileobject, seq): | 
|  | 28     ''' write single sequence to fasta file''' | 
|  | 29     fileobject.write(">") | 
|  | 30     fileobject.write(seq['name'] + "\n") | 
|  | 31     fileobject.write(seq['sequence'] + "\n") | 
|  | 32 | 
|  | 33 | 
|  | 34 def main(): | 
|  | 35     from optparse import OptionParser | 
|  | 36     parser = OptionParser() | 
|  | 37     parser.add_option("-a", | 
|  | 38                       "--fasta_file_A", | 
|  | 39                       dest="seqfileA", | 
|  | 40                       help="input sequences in fasta format") | 
|  | 41     parser.add_option("-b", | 
|  | 42                       "--fasta_file_B", | 
|  | 43                       dest="seqfileB", | 
|  | 44                       help="input sequences in fasta format") | 
|  | 45     parser.add_option("-p", | 
|  | 46                       "--fasta_file_pairs", | 
|  | 47                       dest="seqfile_pairs", | 
|  | 48                       help="output file with paired sequences") | 
|  | 49     parser.add_option("-x", | 
|  | 50                       "--fasta_file_singles", | 
|  | 51                       dest="seqfile_singles", | 
|  | 52                       help="output file with single sequences") | 
|  | 53     options, args = parser.parse_args() | 
|  | 54 | 
|  | 55     # Input files | 
|  | 56     fA = open(options.seqfileA, 'r') | 
|  | 57     fB = open(options.seqfileB, 'r') | 
|  | 58     # Output files | 
|  | 59     if options.seqfile_pairs: | 
|  | 60         fPairs = open(options.seqfile_pairs, 'w') | 
|  | 61     else: | 
|  | 62         fPairs = open(options.seqfileA + ".pairs", 'w') | 
|  | 63 | 
|  | 64     if options.seqfile_singles: | 
|  | 65         single = open(options.seqfile_singles, "w") | 
|  | 66     else: | 
|  | 67         single = open(options.seqfileA + ".single", "w") | 
|  | 68 | 
|  | 69 | 
|  | 70     sA1 = readSingleSeq(fA) | 
|  | 71     sB1 = readSingleSeq(fB) | 
|  | 72     if not sA1 or not sB1: | 
|  | 73         raise Exception("\nEmpty sequence on input, nothing to interlace!\n") | 
|  | 74     charA = sA1['name'][-1] | 
|  | 75     charB = sB1['name'][-1] | 
|  | 76     # validate sequence names | 
|  | 77     if charA == charB: | 
|  | 78         sys.stderr.write( | 
|  | 79             "last character of sequence id must be used for distinguishing pairs!") | 
|  | 80         exit(1) | 
|  | 81         # check first thousand! | 
|  | 82     for i in range(1000): | 
|  | 83         seqA = readSingleSeq(fA) | 
|  | 84         seqB = readSingleSeq(fB) | 
|  | 85         if (not seqA) or (not seqB): | 
|  | 86             # end of file: | 
|  | 87             if i == 0: | 
|  | 88                 sys.stderr.write("input file is empty") | 
|  | 89                 exit(1) | 
|  | 90             else: | 
|  | 91                 break | 
|  | 92         if seqA['name'][-1] == charA and seqB['name'][-1] == charB: | 
|  | 93             continue | 
|  | 94         else: | 
|  | 95             sys.stderr.write( | 
|  | 96                 "last character of sequence id must be used for distinguishing pairs!") | 
|  | 97             exit(1) | 
|  | 98 | 
|  | 99     fA.seek(0) | 
|  | 100     fB.seek(0) | 
|  | 101 | 
|  | 102     buffA = {} | 
|  | 103     buffB = {} | 
|  | 104     buffA_names = [] | 
|  | 105     buffB_names = [] | 
|  | 106 | 
|  | 107     while True: | 
|  | 108 | 
|  | 109         seqA = readSingleSeq(fA) | 
|  | 110         seqB = readSingleSeq(fB) | 
|  | 111 | 
|  | 112         if not seqA and not seqB: | 
|  | 113             break  # end of file | 
|  | 114 | 
|  | 115         ## validation and direct checking only if not end of files | 
|  | 116         if seqA and seqB: | 
|  | 117             #validate: | 
|  | 118             if not (seqA['name'][-1] == charA and seqB['name'][-1] == charB): | 
|  | 119                 sys.stderr.write( | 
|  | 120                     "last character of sequence id must be used for distinguishing pairs!") | 
|  | 121                 exit(1) | 
|  | 122 | 
|  | 123                 # check if current seqs are pairs | 
|  | 124             if seqA['name'][:-1] == seqB['name'][:-1]: | 
|  | 125                 writeSingleSeq(fPairs, seqA) | 
|  | 126                 writeSingleSeq(fPairs, seqB) | 
|  | 127                 continue | 
|  | 128 | 
|  | 129             ### compare whith buffers | 
|  | 130             ### seqA vs buffB | 
|  | 131         if seqA: | 
|  | 132             if seqA["name"][:-1] in buffB: | 
|  | 133                 writeSingleSeq(fPairs, seqA) | 
|  | 134                 seqtmp = {"name": seqA["name"][:-1] + charB, | 
|  | 135                           "sequence": buffB[seqA["name"][:-1]]} | 
|  | 136                 writeSingleSeq(fPairs, seqtmp) | 
|  | 137                 # can I empty buffA ??? | 
|  | 138                 for i in buffA_names: | 
|  | 139                     seqtmp = {"name": i + charA, "sequence": buffA[i]} | 
|  | 140                     writeSingleSeq(single, seqtmp) | 
|  | 141                     buffA = {} | 
|  | 142                     buffA_names = [] | 
|  | 143 | 
|  | 144                 j = 0 | 
|  | 145                 for i in buffB_names: | 
|  | 146                     seqtmp = {"name": i + charB, "sequence": buffB[i]} | 
|  | 147                     del buffB[i] | 
|  | 148                     j += 1 | 
|  | 149                     if i == seqA["name"][:-1]: | 
|  | 150                         del buffB_names[0:j] | 
|  | 151                         break | 
|  | 152                     else: | 
|  | 153                         writeSingleSeq(single, seqtmp) | 
|  | 154             else: | 
|  | 155                 buffA[seqA["name"][:-1]] = seqA['sequence'] | 
|  | 156                 buffA_names.append(seqA["name"][:-1]) | 
|  | 157 | 
|  | 158                 ### seqA vs buffB | 
|  | 159         if seqB: | 
|  | 160             if seqB["name"][:-1] in buffA: | 
|  | 161                 seqtmp = {"name": seqB["name"][:-1] + charA, | 
|  | 162                           "sequence": buffA[seqB["name"][:-1]]} | 
|  | 163                 writeSingleSeq(fPairs, seqtmp) | 
|  | 164                 writeSingleSeq(fPairs, seqB) | 
|  | 165                 # can I empty buffB ??? | 
|  | 166                 for i in buffB_names: | 
|  | 167                     seqtmp = {"name": i + charB, "sequence": buffB[i]} | 
|  | 168                     writeSingleSeq(single, seqtmp) | 
|  | 169                     buffB = {} | 
|  | 170                     buffB_names = [] | 
|  | 171 | 
|  | 172                 j = 0 | 
|  | 173                 for i in buffA_names: | 
|  | 174                     seqtmp = {"name": i + charA, "sequence": buffA[i]} | 
|  | 175                     del buffA[i] | 
|  | 176                     j += 1 | 
|  | 177                     if i == seqB["name"][:-1]: | 
|  | 178                         del buffA_names[0:j] | 
|  | 179                         break | 
|  | 180                     else: | 
|  | 181                         writeSingleSeq(single, seqtmp) | 
|  | 182 | 
|  | 183             else: | 
|  | 184                 buffB[seqB["name"][:-1]] = seqB['sequence'] | 
|  | 185                 buffB_names.append(seqB["name"][:-1]) | 
|  | 186                 fA.close() | 
|  | 187                 fB.close() | 
|  | 188                 fPairs.close() | 
|  | 189                 # write rest of singles: | 
|  | 190     for i in buffA: | 
|  | 191         seqtmp = {"name": i + charA, "sequence": buffA[i]} | 
|  | 192         writeSingleSeq(single, seqtmp) | 
|  | 193     for i in buffB: | 
|  | 194         seqtmp = {"name": i + charB, "sequence": buffB[i]} | 
|  | 195         writeSingleSeq(single, seqtmp) | 
|  | 196         single.close() | 
|  | 197 | 
|  | 198 | 
|  | 199 if __name__ == "__main__": | 
|  | 200     main() |