Mercurial > repos > tduigou > seq_from_db
comparison get_db_info.py @ 2:11a3752feb0a draft default tip
planemo upload for repository https://github.com/brsynth/galaxytools/tree/main/tools commit 7f5d8b62d749a0c41110cd9c04e0254e4fd44893-dirty
| author | tduigou |
|---|---|
| date | Wed, 15 Oct 2025 12:33:41 +0000 |
| parents | 7680420caf9f |
| children |
comparison
equal
deleted
inserted
replaced
| 1:7680420caf9f | 2:11a3752feb0a |
|---|---|
| 91 print("Database not ready, retrying...") | 91 print("Database not ready, retrying...") |
| 92 time.sleep(2) | 92 time.sleep(2) |
| 93 raise Exception("Database connection failed after timeout.") | 93 raise Exception("Database connection failed after timeout.") |
| 94 | 94 |
| 95 | 95 |
| 96 def fetch_annotations(csv_file, sequence_column, annotation_columns, db_uri, table_name, fragment_column_name, output): | 96 def fetch_annotations(csv_file, sequence_column, annotation_columns, db_uri, table_name, fragment_column_name, output, output_report): |
| 97 """Fetch annotations from the database and save the result as GenBank files.""" | 97 """Fetch annotations from the database and save the result as GenBank files.""" |
| 98 db_uri = fix_db_uri(db_uri) | 98 db_uri = fix_db_uri(db_uri) |
| 99 df = pd.read_csv(csv_file, sep=',', header=None) | 99 df = pd.read_csv(csv_file, sep=',', header=None) |
| 100 | 100 |
| 101 engine = create_engine(db_uri) | 101 engine = create_engine(db_uri) |
| 128 if fragment_str not in all_ids: | 128 if fragment_str not in all_ids: |
| 129 csv_fragments.add(fragment_str) | 129 csv_fragments.add(fragment_str) |
| 130 | 130 |
| 131 db_fragments = set(fragment_map.keys()) | 131 db_fragments = set(fragment_map.keys()) |
| 132 missing_fragments = sorted(list(csv_fragments - db_fragments)) | 132 missing_fragments = sorted(list(csv_fragments - db_fragments)) |
| 133 if missing_fragments: | 133 |
| 134 raise ValueError( | 134 # Write report file |
| 135 f" Missing fragments in DB: {', '.join(missing_fragments)}" | 135 with open(output_report, "w") as report_file: |
| 136 ) | 136 if missing_fragments: |
| | 137 for frag in missing_fragments: |
| | 138 report_file.write(f"{frag}\n") |
| | 139 else: |
| | 140 report_file.write("") |
| 137 | 141 |
| 138 # === CONTINUE WITH GB FILE CREATION === | 142 # === CONTINUE WITH GB FILE CREATION === |
| 139 for _, row in df.iterrows(): | 143 for _, row in df.iterrows(): |
| 140 annotated_row = {"Backbone": row[0], "Fragments": []} | 144 annotated_row = {"Backbone": row[0], "Fragments": []} |
| 141 for col in df.columns: | 145 for col in df.columns: |
| 162 | 166 |
| 163 # GenBank file generation per fragment | 167 # GenBank file generation per fragment |
| 164 try: | 168 try: |
| 165 for annotated_row in annotated_data: | 169 for annotated_row in annotated_data: |
| 166 backbone_id = annotated_row["Backbone"] | 170 backbone_id = annotated_row["Backbone"] |
| | 171 |
| 167 for fragment in annotated_row["Fragments"]: | 172 for fragment in annotated_row["Fragments"]: |
| 168 fragment_id = fragment["id"] | 173 fragment_id = fragment["id"] |
| | 174 |
| | 175 # Skip generation for missing fragments |
| | 176 if fragment_id in missing_fragments: |
| | 177 continue |
| | 178 |
| 169 sequence = fragment.get(sequence_column, "") | 179 sequence = fragment.get(sequence_column, "") |
| 170 annotation = fragment.get(annotation_columns, "") | 180 annotation = fragment.get(annotation_columns, "") |
| 171 | 181 |
| 172 # Create the SeqRecord | 182 # Create the SeqRecord |
| 173 record = SeqRecord( | 183 record = SeqRecord( |
| 180 # Add annotations to GenBank header | 190 # Add annotations to GenBank header |
| 181 record.annotations = { | 191 record.annotations = { |
| 182 k: str(fragment[k]) for k in annotation_columns if k in fragment | 192 k: str(fragment[k]) for k in annotation_columns if k in fragment |
| 183 } | 193 } |
| 184 | 194 |
| 185 # LOCUS line extraction from annotation (copy-paste the LOCUS from annotation) | 195 # LOCUS line extraction from annotation |
| 186 locus_line_match = re.search(r"LOCUS\s+.+", annotation) | 196 locus_line_match = re.search(r"LOCUS\s+.+", annotation) |
| 187 if locus_line_match: | 197 if locus_line_match: |
| 188 locus_line = locus_line_match.group() | 198 locus_line = locus_line_match.group() |
| 189 else: | 199 else: |
| 190 print(f"LOCUS info missing for fragment {fragment_id}") | 200 print(f"LOCUS info missing for fragment {fragment_id}") |
| 191 locus_line = f"LOCUS {fragment_id: <20} {len(sequence)} bp DNA linear UNK 01-JAN-2025" | 201 locus_line = f"LOCUS {fragment_id: <20} {len(sequence)} bp DNA linear UNK 01-JAN-2025" |
| 192 | 202 |
| 193 # Format sequence as per GenBank standards (with ORIGIN and line breaks) | 203 # Format sequence |
| 194 if "ORIGIN" in sequence: | 204 if "ORIGIN" in sequence: |
| 195 origin_block = sequence.strip() | 205 origin_block = sequence.strip() |
| 196 else: | 206 else: |
| 197 # Format sequence as per GenBank standards (with ORIGIN and line breaks) | |
| 198 formatted_sequence = "ORIGIN\n" | 207 formatted_sequence = "ORIGIN\n" |
| 199 seq_str = str(record.seq) | 208 seq_str = str(record.seq) |
| 200 for i in range(0, len(seq_str), 60): # 60 bases per line | 209 for i in range(0, len(seq_str), 60): |
| 201 line_seq = seq_str[i:i + 60] | 210 line_seq = seq_str[i:i + 60] |
| 202 formatted_sequence += f"{str(i + 1).rjust(9)} { ' '.join([line_seq[j:j+10] for j in range(0, len(line_seq), 10)]) }\n" | 211 formatted_sequence += f"{str(i + 1).rjust(9)} { ' '.join([line_seq[j:j+10] for j in range(0, len(line_seq), 10)]) }\n" |
| 203 origin_block = formatted_sequence.strip() | 212 origin_block = formatted_sequence.strip() |
| 204 | 213 |
| 205 # Find and copy the FEATURES section directly from annotation | 214 # Extract FEATURES section |
| 206 features_section = "" | 215 features_section = "" |
| 207 features_start = annotation.find("FEATURES") | 216 features_start = annotation.find("FEATURES") |
| 208 if features_start != -1: | 217 if features_start != -1: |
| 209 features_section = annotation[features_start:] | 218 features_section = annotation[features_start:] |
| 210 | 219 |
| 211 # Writing the GenBank file | 220 # Write GenBank file |
| 212 if not os.path.exists(output): | 221 if not os.path.exists(output): |
| 213 os.makedirs(output) | 222 os.makedirs(output) |
| 214 | 223 |
| 215 gb_filename = os.path.join(output, f"{fragment_id}.gb") | 224 gb_filename = os.path.join(output, f"{fragment_id}.gb") |
| 216 with open(gb_filename, "w") as f: | 225 with open(gb_filename, "w") as f: |
| 217 # Write the LOCUS line | |
| 218 f.write(locus_line + "\n") | 226 f.write(locus_line + "\n") |
| 219 # Write DEFINITION, ACCESSION, and other annotations | |
| 220 f.write(f"DEFINITION {record.description}\n") | 227 f.write(f"DEFINITION {record.description}\n") |
| 221 f.write(f"ACCESSION {record.id}\n") | 228 f.write(f"ACCESSION {record.id}\n") |
| 222 f.write(f"VERSION DB\n") | 229 f.write(f"VERSION DB\n") |
| 223 f.write(f"KEYWORDS .\n") | 230 f.write(f"KEYWORDS .\n") |
| 224 f.write(f"SOURCE .\n") | 231 f.write(f"SOURCE .\n") |
| 225 # Write the FEATURES section directly from annotation | |
| 226 f.write(features_section) | 232 f.write(features_section) |
| 227 # Write the ORIGIN section | |
| 228 f.write(origin_block + "\n") | 233 f.write(origin_block + "\n") |
| 229 f.write("//\n") | 234 f.write("//\n") |
| 230 | 235 |
| 231 except Exception as e: | 236 except Exception as e: |
| 232 print(f"Error saving GenBank files: {e}") | 237 print(f"Error saving GenBank files: {e}") |
| 242 parser.add_argument("--db_uri", required=False, help="Database URI connection string") | 247 parser.add_argument("--db_uri", required=False, help="Database URI connection string") |
| 243 parser.add_argument("--table", required=False, help="Table name in the database") | 248 parser.add_argument("--table", required=False, help="Table name in the database") |
| 244 parser.add_argument("--fragment_column", required=False, help="Fragment column name in the database") | 249 parser.add_argument("--fragment_column", required=False, help="Fragment column name in the database") |
| 245 parser.add_argument("--output", required=True, help="Output dir for gb files") | 250 parser.add_argument("--output", required=True, help="Output dir for gb files") |
| 246 parser.add_argument("--json_conf", required=False, help="JSON config file with DB parameters") | 251 parser.add_argument("--json_conf", required=False, help="JSON config file with DB parameters") |
| | 252 parser.add_argument("--report", required=True, help="Output report for fragments checking in DB") |
| 247 args = parser.parse_args() | 253 args = parser.parse_args() |
| 248 | 254 |
| 249 # get param and chek for json | 255 # get param and chek for json |
| 250 config_params = {} | 256 config_params = {} |
| 251 use_json = args.use_json_paramers == 'true' | 257 use_json = args.use_json_paramers == 'true' |
| 285 raise e | 291 raise e |
| 286 else: | 292 else: |
| 287 time.sleep(2) | 293 time.sleep(2) |
| 288 | 294 |
| 289 # Fetch annotations from the database and save as gb | 295 # Fetch annotations from the database and save as gb |
| 290 fetch_annotations(args.input, sequence_column, annotation_column, db_uri, table, fragment_column, args.output) | 296 fetch_annotations(args.input, sequence_column, annotation_column, db_uri, table, fragment_column, args.output, args.report) |
| 291 | 297 |
| 292 if __name__ == "__main__": | 298 if __name__ == "__main__": |
| 293 main() | 299 main() |
