Mercurial > repos > tduigou > save_to_db
changeset 0:7ff266aecf01 draft default tip
planemo upload for repository https://github.com/brsynth/galaxytools/tree/main/tools commit 3401816c949b538bd9c67e61cbe92badff6a4007-dirty
| author | tduigou | 
|---|---|
| date | Wed, 11 Jun 2025 09:42:24 +0000 | 
| parents | |
| children | |
| files | save_to_db.py seq_to_db.xml test-data/HC_Amp_ccdB.gb test-data/p4_Kt-L7Ae-Weiss.gb test-data/p6_Kozak-ATG.gb test-data/p6_Nt-IgKLsequence.gb test-data/p7_L7Ae-Weiss.gb test-data/test-JSON_arg.json test-data/test-JSON_arg_block.json test-data/test_raport.txt verification.py | 
| diffstat | 11 files changed, 879 insertions(+), 0 deletions(-) [+] | 
line wrap: on
 line diff
# save_to_db.py (file added in changeset 0:7ff266aecf01)
"""Save GenBank (.gb) files into a database table.

Each input file is split at its ``ORIGIN`` line: everything before it is
stored in the annotation column, the ``ORIGIN`` line and everything after it
in the sequence column.  One row is inserted per file, keyed by a fragment
name taken from the ``--file_name_mapping`` argument.  The names of the
inserted fragments are written, one per line, to the ``--output`` report.

Database parameters can be given on the command line or, as a fallback, in a
JSON config file under ``JSON_``-prefixed keys (``JSON_table``,
``JSON_db_uri``, ...).
"""
import subprocess
import argparse
import time
import os
import socket
import re
import json
from sqlalchemy import create_engine, inspect
from sqlalchemy.engine.url import make_url
from sqlalchemy.sql import text
from sqlalchemy.exc import ArgumentError, OperationalError


# First line starting with "ORIGIN": it separates annotations from sequence.
_ORIGIN_RE = re.compile(r"^ORIGIN.*$", flags=re.MULTILINE)


def resolve_parameters(user_params: dict, json_params: dict, keys: list):
    """Merge user-supplied parameters with values from the JSON config.

    For every name in *keys* the value from *user_params* wins when it is
    truthy (not ``None`` and not an empty string); otherwise the value stored
    under ``"JSON_<key>"`` in *json_params* is used (``None`` if absent).

    Returns:
        dict mapping each key to its resolved value.
    """
    return {
        key: user_params.get(key) or json_params.get(f"JSON_{key}")
        for key in keys
    }


def fix_db_uri(uri):
    """Replace __at__ with @ in the URI if needed."""
    return uri.replace("__at__", "@")


def _redact_uri(uri):
    """Return *uri* with its password masked so it is safe to print."""
    try:
        return make_url(uri).render_as_string(hide_password=True)
    except ArgumentError:
        return "<unparseable database URI>"


def is_port_in_use(uri):
    """Check if a TCP port is already in use on host.

    Args:
        uri: either a database URI (host and port are read from it) or a bare
            integer port number, which is probed on ``localhost``.

    Returns:
        True when a TCP connection to the host/port succeeds.

    Raises:
        ValueError: if the URI does not carry a port.
    """
    if isinstance(uri, int):
        host, port = "localhost", uri
    else:
        url = make_url(uri)
        host, port = url.host or "localhost", url.port
    if port is None:
        raise ValueError("Cannot check port usage: the URI has no port.")

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(2)
        # connect_ex returns 0 when something accepted the connection.
        return s.connect_ex((host, port)) == 0


def extract_db_name(uri):
    """Extract the database name from the SQLAlchemy URI."""
    return make_url(uri).database


def _docker_container_ids(container_name, include_stopped=False):
    """Return the output of ``docker ps -q`` filtered on *container_name*.

    The command is passed as an argument list (no shell), so the container
    name cannot be interpreted as shell syntax.
    """
    command = ["docker", "ps"]
    if include_stopped:
        command.append("-a")
    command += ["-q", "-f", f"name={container_name}"]
    result = subprocess.run(command, capture_output=True, text=True)
    return result.stdout.strip()


# This function activates the Docker container when the DB lives in one.
# It is currently NOT called from main().
def start_postgres_container(db_name):
    """Start a PostgreSQL container with the given database name as the container name."""
    container_name = db_name

    # Check if container is already running
    if _docker_container_ids(container_name):
        print(f"Container '{container_name}' is already running.")
        return

    # Check if container exists (stopped)
    if _docker_container_ids(container_name, include_stopped=True):
        print(f"Starting existing container '{container_name}'...")
        subprocess.run(["docker", "start", container_name])
        print(f"PostgreSQL Docker container '{container_name}' activated.")
        return

    # If container does not exist, create and start a new one
    port = 5432 if not is_port_in_use(5432) else 5433
    # NOTE(review): hard-coded fallback password kept for backward
    # compatibility; prefer always setting POSTGRES_PASSWORD.
    postgres_password = os.getenv("POSTGRES_PASSWORD", "RK17")

    start_command = [
        "docker", "run", "--name", container_name,
        "-e", f"POSTGRES_PASSWORD={postgres_password}",
        "-p", f"{port}:5432",
        "-d", "postgres"
    ]

    try:
        subprocess.run(start_command, check=True)
        print(f"PostgreSQL Docker container '{container_name}' started on port {port}.")
    except subprocess.CalledProcessError as e:
        print(f"Failed to start Docker container: {e}")


def wait_for_db(uri, timeout=60):
    """Try connecting to the DB until it works or timeout.

    Args:
        uri: SQLAlchemy database URI.
        timeout: maximum number of seconds to keep retrying.

    Raises:
        TimeoutError: if no connection could be opened within *timeout*.
    """
    engine = create_engine(uri)
    try:
        start_time = time.time()
        while time.time() - start_time < timeout:
            try:
                with engine.connect():
                    print("Connected to database.")
                    return
            except OperationalError:
                print("Database not ready, retrying...")
                time.sleep(2)
        raise TimeoutError("Database connection failed after timeout.")
    finally:
        # Release the pool used only for probing.
        engine.dispose()


def _parse_file_name_mapping(file_name_mapping):
    """Parse ``"path:name,path:name"`` into ``{basename(path): name_without_ext}``.

    Only the first ``:`` of an entry separates path from name, so names that
    contain a colon are kept intact.  Blank entries are ignored.

    Raises:
        ValueError: if an entry has no ``:`` separator.
    """
    mapping = {}
    for entry in file_name_mapping.split(","):
        if not entry.strip():
            continue
        path, separator, fragment_name = entry.partition(":")
        if not separator:
            raise ValueError(f"Invalid file_name_mapping entry '{entry}': expected 'path:name'.")
        mapping[os.path.basename(path)] = os.path.splitext(fragment_name)[0]
    return mapping


def _split_genbank(content, gb_file):
    """Split GenBank text into ``(annotation_text, sequence_text)`` at ORIGIN.

    Raises:
        ValueError: if *content* has no line starting with ``ORIGIN``.
    """
    origin_match = _ORIGIN_RE.search(content)
    if not origin_match:
        raise ValueError(f"ORIGIN section not found in file: {gb_file}")
    origin_start = origin_match.start()
    return content[:origin_start].strip(), content[origin_start:].strip()


def push_gb_annotations(gb_files, sequence_column, annotation_column, db_uri, table_name, fragment_column_name,
                        output, file_name_mapping):
    """Push GenBank file content into the database if the fragment is not already present.

    Args:
        gb_files: paths of the GenBank files to store.
        sequence_column: column receiving the ORIGIN line and what follows it.
        annotation_column: column receiving everything before ORIGIN.
        db_uri: database URI (``__at__`` is accepted in place of ``@``).
        table_name: target table.
        fragment_column_name: column holding the fragment names.
        output: path of the text report listing the inserted fragment names.
        file_name_mapping: ``"path:name,..."`` string giving the fragment
            name to use for each input file.

    Raises:
        ValueError: unknown column, missing mapping entry, or missing ORIGIN.
        RuntimeError: a fragment already exists in the table, or two input
            files map to the same fragment name.
    """
    db_uri = fix_db_uri(db_uri)
    engine = create_engine(db_uri)
    inserted_fragments = []

    try:
        file_name_mapping_dict = _parse_file_name_mapping(file_name_mapping)

        with engine.begin() as connection:
            inspector = inspect(engine)
            columns = {col['name'] for col in inspector.get_columns(table_name)}

            if fragment_column_name not in columns:
                raise ValueError(f"Fragment column '{fragment_column_name}' not found in table '{table_name}'.")
            for kind, column in (("Annotation", annotation_column), ("Sequence", sequence_column)):
                if column not in columns:
                    raise ValueError(f"{kind} column '{column}' not found in table '{table_name}'.")

            # Identifiers cannot be bound as parameters; quote them through
            # the dialect so reserved words, mixed-case names and hostile
            # input cannot alter the statement.
            quote = engine.dialect.identifier_preparer.quote
            table_sql = quote(table_name)
            fragment_sql = quote(fragment_column_name)

            # Get existing fragments
            all_rows = connection.execute(text(f"SELECT {fragment_sql} FROM {table_sql}")).fetchall()
            existing_fragments = {row[0] for row in all_rows}

            insert_rows = []

            for gb_file in gb_files:
                # The mapping is keyed by the bare file name, not the full path.
                real_file_name = os.path.basename(gb_file)
                fragment_name = file_name_mapping_dict.get(real_file_name)

                print(f"Processing file: {real_file_name}({fragment_name})")  # Debugging: Log the current file

                if not fragment_name:
                    raise ValueError(f"Fragment name not found for file '{real_file_name}' in file_name_mapping.")

                # If the fragment is already in the DB, raise an error and stop the process
                if fragment_name in existing_fragments:
                    raise RuntimeError(f"Fatal Error: Fragment '{fragment_name}' already exists in DB. Stopping the process.")

                # Same name twice in one batch would also create a duplicate.
                if fragment_name in inserted_fragments:
                    raise RuntimeError(f"Fatal Error: Fragment '{fragment_name}' is provided more than once. Stopping the process.")

                with open(gb_file, "r") as f:
                    content = f.read()

                annotation_text, sequence_text = _split_genbank(content, gb_file)

                insert_rows.append({
                    "fragment": fragment_name,
                    "annotation": annotation_text,
                    "sequence": sequence_text,
                })
                inserted_fragments.append(fragment_name)

            # Insert the rows into the database.  Bind names are fixed so
            # they stay valid whatever the column names look like.
            insert_stmt = text(
                f"INSERT INTO {table_sql} "
                f"({fragment_sql}, {quote(annotation_column)}, {quote(sequence_column)}) "
                "VALUES (:fragment, :annotation, :sequence)"
            )
            if insert_rows:
                connection.execute(insert_stmt, insert_rows)

            print(f"Inserted {len(insert_rows)} fragments.")

            # Write inserted fragment names to a text file
            with open(output, "w") as log_file:
                for frag in inserted_fragments:
                    log_file.write(f"{frag}\n")
            print(f"Fragment names written to '{output}'.")

    except Exception as e:
        print(f"Error during GB file insertion: {e}")
        raise
    finally:
        engine.dispose()


def main():
    """Command-line entry point: parse arguments, wait for the DB, insert the files."""
    parser = argparse.ArgumentParser(description="Save GenBank files (annotation and sequence) into a database table.")
    parser.add_argument("--input", required=True, help="Input gb files")
    parser.add_argument("--sequence_column", required=True, help="DB column contains sequence for GenBank file")
    parser.add_argument("--annotation_column", required=True, help="DB column contains head for GenBank file")
    parser.add_argument("--db_uri", required=True, help="Database URI connection string")
    parser.add_argument("--table", required=True, help="Table name in the database")
    parser.add_argument("--fragment_column", required=True, help="Fragment column name in the database")
    parser.add_argument("--output", required=True, help="Text report")
    parser.add_argument("--file_name_mapping", required=True, help="real fragments names")
    parser.add_argument("--json_conf", required=False, help="JSON config file with DB parameters")
    args = parser.parse_args()

    # Load JSON config if provided.  The Galaxy wrapper passes the literal
    # string 'None' when no config dataset is selected.
    json_config = {}
    if args.json_conf and args.json_conf != 'None':
        with open(args.json_conf, "r") as f:
            json_config = json.load(f)
        # Accept "false", "False" and a JSON boolean false alike.
        if str(json_config.get("execution", "")).strip().lower() == "false":
            print("Execution was blocked by config (execution = false)")
            return

    # Prefer user input; fallback to JSON_ values if not provided
    user_params = {
        "table": args.table,
        "sequence_column": args.sequence_column,
        "annotation_column": args.annotation_column,
        "fragment_column": args.fragment_column,
        "db_uri": args.db_uri
    }

    keys = ["table", "sequence_column", "annotation_column", "fragment_column", "db_uri"]
    resolved = resolve_parameters(user_params, json_config, keys)

    missing = [key for key in keys if not resolved[key]]
    if missing:
        parser.error(
            "Missing parameter(s) (neither given on the command line nor in the JSON config): "
            + ", ".join(missing)
        )

    # Unpack resolved parameters
    table = resolved["table"]
    sequence_column = resolved["sequence_column"]
    annotation_column = resolved["annotation_column"]
    fragment_column = resolved["fragment_column"]
    db_uri = fix_db_uri(resolved["db_uri"])

    # Prepare gb files
    gb_file_list = [f.strip() for f in args.input.split(",") if f.strip()]

    # Wait for DB (starting a Docker container is intentionally disabled):
    # db_name = extract_db_name(db_uri)
    # start_postgres_container(db_name)
    MAX_RETRIES = 3
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            wait_for_db(db_uri)
            break  # Success
        except Exception:
            if attempt == MAX_RETRIES:
                # Never print the raw URI: it contains the DB password.
                print(f"Attempt {attempt} failed: Could not connect to database at {_redact_uri(db_uri)}.")
                raise
            time.sleep(2)

    # Push annotations
    push_gb_annotations(
        gb_file_list,
        sequence_column,
        annotation_column,
        db_uri,
        table,
        fragment_column,
        args.output,
        args.file_name_mapping
    )


if __name__ == "__main__":
    main()
<!-- seq_to_db.xml (file added in changeset 0:7ff266aecf01): Galaxy wrapper around save_to_db.py -->
<tool id="seq_to_db" name="Save Sequence Data In DB" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.09">
    <description>Save fragment's sequence in an accessible database and import it from .gb files</description>
    <macros>
        <token name="@VERSION_SUFFIX@">0</token>
        <token name="@TOOL_VERSION@">0.1.0</token>
    </macros>
    <requirements>
        <requirement type="package" version="2.2.3">pandas</requirement>
        <requirement type="package" version="2.0.40">sqlalchemy</requirement>
        <requirement type="package" version="2.9.9">psycopg2</requirement>
    </requirements>
    <!-- Builds two comma-separated lists from the input collection: the dataset paths,
         and "path:name" pairs that save_to_db.py uses to recover each fragment name. -->
    <command detect_errors="exit_code"><![CDATA[
        #set genbank_file_paths = ','.join([str(f) for f in $genbank_files])
        #set $file_name_mapping = ",".join(["%s:%s" % (file.file_name, file.name) for file in $genbank_files])
        python '$__tool_directory__/save_to_db.py'
            --input '$genbank_file_paths'
            --sequence_column '$sequence_column'
            --annotation_column '$annotation_column'
            --db_uri '$db_uri'
            --table '$table'
            --fragment_column '$fragment_column'
            --output '$output'
            --file_name_mapping '$file_name_mapping'
            --json_conf '$json_conf'
    ]]></command>
    <inputs>
        <param name="genbank_files" type="data_collection" collection_type="list" format="genbank" label="GenBank File(s)"/>
        <param name="table" type="text" label="DB Table Name" optional="true" help="It can be extracted from JSON file -key:'JSON_table'-" />
        <param name="sequence_column" type="text" label="DB Column Contains Sequence For ganbank File" optional="true" help="It can be extracted from JSON file -key:'JSON_sequence_column'-" />
        <param name="annotation_column" type="text" label="DB Column Contains Annotation For Ganbank File" optional="true" help="It can be extracted from JSON file -key:'JSON_annotation_column'-" />
        <param name="fragment_column" type="text" label="DB IDs Column Name" optional="true" help="It can be extracted from JSON file -key:'JSON_fragment_column'-" />
        <param name="db_uri" type="text" label="DB Connection URI" help="postgresql://container_name:password@host:port/path/to/database (It can be extracted from JSON file -key:'JSON_db_uri'-)" optional="true" />
        <section name='adv' title='Advance' expanded='false'>
            <param name="json_conf" type="data" format='json' label="DB config as a json file" help="JSON file specifying the database URI, table name and the column names for annotation and sequence data" optional="true" />
        </section>
    </inputs>
    <outputs>
        <data name="output" format="txt" label="saving report" />
    </outputs>
    <tests>
        <!-- Only one saving test can succeed per run: once the fragments are stored, a second
             test fails because they are already present in the DB
             (execute ../get_db_data/testMock.py to regenerate the initial DB). -->
        <!-- Test: tool blocked from the JSON config.
             NOTE(review): the original comment describes this test as commented out because it is
             expected to fail (it validates that the JSON file can block the tool); it is
             enabled here - confirm whether it should be. -->
        <test>
            <param name="genbank_files">
                <collection type="list">
                    <element name="p7_L7Ae-Weiss" value="p7_L7Ae-Weiss.gb" />
                    <element name="p7_gfp_sequence" value="p6_Nt-IgKLsequence.gb" />
                    <element name="p14_CMVp" value="p6_Kozak-ATG.gb" />
                    <element name="p16_bGHpolyA" value="p4_Kt-L7Ae-Weiss.gb" />
                    <element name="p18_CMVp" value="HC_Amp_ccdB.gb" />
                </collection>
            </param>
            <param name="adv|json_conf" value="test-JSON_arg_block.json" />
            <output name="output" file="test_raport.txt" ftype="txt" >
                <assert_contents>
                    <has_n_lines n="5" />
                    <has_line_matching expression="p7_L7Ae-Weiss" />
                    <has_line_matching expression="p6_Nt-IgKLsequence" />
                    <has_line_matching expression="p6_Kozak-ATG" />
                    <has_line_matching expression="p4_Kt-L7Ae-Weiss" />
                    <has_line_matching expression="HC_Amp_ccdB" />
                </assert_contents>
            </output>
        </test>
        <!-- Test: DB config given through the tool parameters -->
        <test>
            <param name="genbank_files">
                <collection type="list">
                    <element name="p7_L7Ae-Weiss" value="p7_L7Ae-Weiss.gb" />
                    <element name="p7_gfp_sequence" value="p6_Nt-IgKLsequence.gb" />
                    <element name="p14_CMVp" value="p6_Kozak-ATG.gb" />
                    <element name="p16_bGHpolyA" value="p4_Kt-L7Ae-Weiss.gb" />
                    <element name="p18_CMVp" value="HC_Amp_ccdB.gb" />
                </collection>
            </param>
            <param name="table" value="sample" />
            <param name="sequence_column" value="sequence" />
            <param name="annotation_column" value="annotation" />
            <param name="fragment_column" value="fragment" />
            <param name="db_uri" value="postgresql://postgres:RK17@localhost:5432/test_fragments_db" />
            <output name="output" file="test_raport.txt" ftype="txt" >
                <assert_contents>
                    <has_n_lines n="5" />
                    <has_line_matching expression="p7_L7Ae-Weiss" />
                    <has_line_matching expression="p6_Nt-IgKLsequence" />
                    <has_line_matching expression="p6_Kozak-ATG" />
                    <has_line_matching expression="p4_Kt-L7Ae-Weiss" />
                    <has_line_matching expression="HC_Amp_ccdB" />
                </assert_contents>
            </output>
        </test>
        <!-- Test: DB config read from the JSON file.
             NOTE(review): the original comment describes this test as commented out because the
             fragments can be saved only once (a second save fails since they are already in the
             DB); to run it, disable the test above so that a single saving test runs. It is
             enabled here - confirm whether it should be. -->
        <test>
            <param name="genbank_files">
                <collection type="list">
                    <element name="p7_L7Ae-Weiss" value="p7_L7Ae-Weiss.gb" />
                    <element name="p7_gfp_sequence" value="p6_Nt-IgKLsequence.gb" />
                    <element name="p14_CMVp" value="p6_Kozak-ATG.gb" />
                    <element name="p16_bGHpolyA" value="p4_Kt-L7Ae-Weiss.gb" />
                    <element name="p18_CMVp" value="HC_Amp_ccdB.gb" />
                </collection>
            </param>
            <param name="adv|json_conf" value="test-JSON_arg.json" />
            <output name="output" file="test_raport.txt" ftype="txt" >
                <assert_contents>
                    <has_n_lines n="5" />
                    <has_line_matching expression="p7_L7Ae-Weiss" />
                    <has_line_matching expression="p6_Nt-IgKLsequence" />
                    <has_line_matching expression="p6_Kozak-ATG" />
                    <has_line_matching expression="p4_Kt-L7Ae-Weiss" />
                    <has_line_matching expression="HC_Amp_ccdB" />
                </assert_contents>
            </output>
        </test>
    </tests>

    <help><![CDATA[
Save Sequence Data In DB
========================

Implemented a system to save GenBank (.gb) files in an accessible DB, based on a connection via URI requests.

**Parameters**:
---------------
* **GenBank File(s)**: List of GenBank files.
* **DB Table Name**: Name of the target table in the PostgreSQL database.
* **DB Column Contains Sequence For ganbank File**: Column storing sequence data, expected to start with "ORIGIN".
* **DB Column Contains Annotation For Ganbank File**: Column containing annotation data; it stores everything that comes before "ORIGIN" in the .gb file.
* **DB IDs Column Name**: Column holding the unique fragment IDs.
* **DB Connection URI**: URI used to connect to the database (e.g., postgresql://postgres:pass@localhost:5432/test_fragments_db).
* **DB config as a json file**: JSON file contains the DB configuration:
    - "JSON_table": will be the key to the table name.
    - "JSON_sequence_column": will be the key to the sequence column.
    - "JSON_annotation_column": will be the key to the annotation column.
    - "JSON_fragment_column": will be the key to the fragment column.
    - "JSON_db_uri": will be the key to the URI.
    - "execution": It is the key to execute or block the tool during a workflow ("True" or "False").
    ]]></help>
    <citations>
        <citation type="bibtex">
            @unpublished{seq_to_db,
                author = {Ramiz Khaled},
                title = {{seq_to_db}},
                url = {https://github.com/brsynth/},
            }
        </citation>
    </citations>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/HC_Amp_ccdB.gb Wed Jun 11 09:42:24 2025 +0000 @@ -0,0 +1,109 @@ +LOCUS Exported 2721 bp ds-DNA circular SYN 14-SEP-2017 +DEFINITION synthetic circular DNA. +ACCESSION . +VERSION . +KEYWORDS HC_Amp_ccdB(1-25).gb +SOURCE synthetic DNA construct + ORGANISM synthetic DNA construct +REFERENCE 1 (bases 1 to 2721) + AUTHORS Trial User + TITLE Direct Submission + JOURNAL Exported Sep 14, 2017 from SnapGene Viewer 4.0.2 + http://www.snapgene.com +FEATURES Location/Qualifiers + source 1..2721 + /organism="synthetic DNA construct" + /mol_type="other DNA" + misc_feature complement(73..78) + /label=BsmBI + misc_feature 129..462 + /label=*ccdB promoter* + exon 463..768 + /label=ccdB + /note="ccdB" + terminator 809..880 + /note="rrnB T1 terminator + transcription terminator T1 from the E. coli rrnB gene" + terminator 896..923 + /note="T7Te terminator + phage T7 early transcription terminator" + misc_feature 930..942 + /label=BioBrick suffix + /note="universal suffix for all parts" + misc_feature 943..948 + /label=BsmBI + terminator 1027..1056 + /note="T3Te terminator + phage T3 early transcription terminator" + rep_origin 1078..1665 + /direction=RIGHT + /label=ori + /note="high-copy-number ColE1/pMB1/pBR322/pUC origin of + replication" + terminator 1677..1704 + /note="T7Te terminator + phage T7 early transcription terminator" + CDS complement(1728..2588) + /codon_start=1 + /gene="bla" + /product="beta-lactamase" + /note="AmpR + confers resistance to ampicillin, carbenicillin, and + related antibiotics" + /translation="MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYI + ELDLNSGKILESLRPEERFPMMSTFKVLLCGAVLSRIDAGQEQLGRRIHYSQNDLVEYS + PVTEKHLTDGMTVRELCSAAITMSDNTAANLLLATIGGPKELTAFLHNMGDHVTRLDRW + EPELNEAIPNDERDTTMPVAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGSLLRSA + LPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGAS + LIKHW" + promoter complement(2589..2691) + /note="cat promoter + promoter of the E. 
coli cat gene" +ORIGIN + 1 ctttctgcta tggaggtcag gtatgattta aatggtcagt attgagcgat atctagagaa + 61 ttcgtcatag gagagacgca atacgcaaac cgcctctccc cgcgcgttgg ccgattcatt + 121 aatgcaggga tccggcttac taaaagccag ataacagtat gcgtatttgc gcgctgattt + 181 ttgcggtata agaatatata ctgatatgta tacccgaagt atgtcaaaaa gaggtatgct + 241 atgaagcagc gtattacagt gacagttgac agcgacagct atcagttgct caaggcatat + 301 atgatgtcaa tatctccggt ctggtaagca caaccatgca gaatgaagcc cgtcgtctgc + 361 gtgccgaacg ctggaaagcg gaaaatcagg aagggatggc tgaggtcgcc cggtttattg + 421 aaatgaacgg ctcttttgct gacgagaaca ggggctggtg aaatgcagtt taaggtttac + 481 acctataaaa gagagagccg ttatcgtctg tttgtggatg tacagagtga tattattgac + 541 acgcccgggc gacggatggt gatccccctg gccagtgcac gtctgctgtc agataaagtc + 601 ccccgtgaac tttacccggt ggtgcatatc ggggatgaaa gctggcgcat gatgaccacc + 661 gatatggcca gtgtgccggt ctccgttatc ggggaagaag tggctgatct cagccaccgc + 721 gaaaatgaca tcaaaaacgc cattaacctg atgttctggg gaatataagc tgatagtgct + 781 agtgtagatc gctactagag ccaggcatca aataaaacga aaggctcagt cgaaagactg + 841 ggcctttcgt tttatctgtt gtttgtcggt gaacgctctc tactagagtc acactggctc + 901 accttcgggt gggcctttct gcgtttatat actagtagcg gccgtctcaa cgataacgaa + 961 ttcaagcttg atatcattca ggacgagcct cagactccag cgtaactgga ctgcaatcaa + 1021 ctcactggct caccttcacg ggtgggcctt tcttcggtag aaaatcaaag gatcttcttg + 1081 agatcctttt tttctgcgcg taatctgctg cttgcaaaca aaaaaaccac cgctaccagc + 1141 ggtggtttgt ttgccggatc aagagctacc aactcttttt ccgaggtaac tggcttcagc + 1201 agagcgcaga taccaaatac tgttcttcta gtgtagccgt agttaggcca ccacttcaag + 1261 aactctgtag caccgcctac atacctcgct ctgctaatcc tgttaccagt ggctgctgcc + 1321 agtggcgata agtcgtgtct taccgggttg gactcaagac gatagttacc ggataaggcg + 1381 cagcggtcgg gctgaacggg gggttcgtgc acacagccca gcttggagcg aacgacctac + 1441 accgaactga gatacctaca gcgtgagcta tgagaaagcg ccacgcttcc cgaagggaga + 1501 aaggcggaca ggtatccggt aagcggcagg gtcggaacag gagagcgcac gagggagctt + 1561 ccagggggaa acgcctggta tctttatagt cctgtcgggt ttcgccacct ctgacttgag + 1621 catcgatttt 
tgtgatgctc gtcagggggg cggagcctat ggaaaaacgc cagcaacgca + 1681 gaaaggccca cccgaaggtg agccaggtga ttacatttgg gccctcatta ccaatgctta + 1741 atcagtgagg cacctatctc agcgatctgt ctatttcgtt catccatagt tgcctgactc + 1801 cccgtcgtgt agataactac gatgcgggag ggcttaccat ctggccccag tgctgcaatg + 1861 ataccgcgag aaccacgctc accggctcca gatttatcag caataaacca gccagccggg + 1921 agggccgagc gcagaagtga tcctgcaact ttatccgcct ccatccagtc tattaattgt + 1981 tgccgggaag ctagagtaag tagttcgcca gttaatagtt tgcgcaacgt tgttgccatt + 2041 gctacaggca tcgtggtgtc acgctcgtcg tttggtatgg cttcattcag ctccggttcc + 2101 caacgatcaa ggcgagttac atgatccccc atgttgtgca aaaaagcggt tagctccttc + 2161 ggtcctccga tcgttgccag aagtaagttg gccgcagtgt tatcactcat ggttatggca + 2221 gcactgcata attctcttac tgtcatgcca tccgtgagat gcttttctgt gactggtgag + 2281 tactcaacca agtcattctg agaatagtgt atgcggcgac cgagttgctc ttgcccggcg + 2341 tcaatacggg ataataccgc gccacatagc agaactttaa aagtgctcat cattggaaaa + 2401 cgttcttcgg ggcgtaaact ctcaaggatc ttaccgctgt tgagatccag ttcgatgtaa + 2461 cccactcgtg cacccaactg atcttcagca tcttttactt tcaccagcgt ttctgggtga + 2521 gcaaaaacag gaaggcaaaa tgccgcaaaa aagggaataa gggcgacacg gaaatgttga + 2581 atactcattt tagcttcctt agctcctgaa aatctcgata actcaaaaaa tacgcccggt + 2641 agtgatctta tttcattatg gtgaaagttg gaacctctta cgtgccgatc aagtcaaaag + 2701 cctccggtcg gaggcttttg a +//
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/p4_Kt-L7Ae-Weiss.gb Wed Jun 11 09:42:24 2025 +0000 @@ -0,0 +1,72 @@ +LOCUS . 1845 bp DNA UNK 01-JAN-1980 +DEFINITION . +ACCESSION <unknown id> +VERSION <unknown id> +KEYWORDS . +SOURCE . + ORGANISM . + . +FEATURES Location/Qualifiers + terminator 392..419 + /note="T7Te terminator" + /note="phage T7 early transcription terminator" + rep_origin complement(431..1018) + /direction=LEFT + /note="ori" + /note="high-copy-number ColE1/pMB1/pBR322/pUC origin of + replication" + terminator 1040..1069 + /note="T3Te terminator" + /note="phage T3 early transcription terminator" + misc_feature 1143..1148 + /note="BsmBI" + source 5..1149 + /source="Exported" + misc_feature 1156..1182 + /note="Kt - L7Ae" + source 1154..1185 + /source="Exported" + misc_feature complement(1191..1196) + /note="BsmBI" + terminator 1263..1294 + /note="tonB terminator" + /note="bidirectional E. coli tonB-P14 transcription + terminator" + promoter 1295..1397 + /note="cat promoter" + /note="promoter of the E. 
coli cat gene" + source 1190..1845 + /source="Exported" +ORIGIN + 1 ctcaggcgca atcacgaatg aataacggtt tggttggtgc gagtgatttt gatgacgagc + 61 gtaatggctg gcctgttgaa caagtctgga aagaaatgca taagcttttg ccattctcac + 121 cggattcagt cgtcactcat ggtgatttct cacttgataa ccttattttt gacgagggga + 181 aattaatagg ttgtattgat gttggacgag tcggaatcgc agaccgatac caggatcttg + 241 ccatcctatg gaactgcctc ggtgagtttt ctccttcatt acagaaacgg ctttttcaaa + 301 aatatggtat tgataatcct gatatgaata aattgcagtt tcacttgatg ctcgatgagt + 361 ttttctaatg agggcccaaa tgtaatcacc tggctcacct tcgggtgggc ctttctgcgt + 421 tgctggcgtt tttccatagg ctccgccccc ctgacgagca tcacaaaaat cgatgctcaa + 481 gtcagaggtg gcgaaacccg acaggactat aaagatacca ggcgtttccc cctggaagct + 541 ccctcgtgcg ctctcctgtt ccgaccctgc cgcttaccgg atacctgtcc gcctttctcc + 601 cttcgggaag cgtggcgctt tctcatagct cacgctgtag gtatctcagt tcggtgtagg + 661 tcgttcgctc caagctgggc tgtgtgcacg aaccccccgt tcagcccgac cgctgcgcct + 721 tatccggtaa ctatcgtctt gagtccaacc cggtaagaca cgacttatcg ccactggcag + 781 cagccactgg taacaggatt agcagagcga ggtatgtagg cggtgctaca gagttcttga + 841 agtggtggcc taactacggc tacactagaa gaacagtatt tggtatctgc gctctgctga + 901 agccagttac ctcggaaaaa gagttggtag ctcttgatcc ggcaaacaaa ccaccgctgg + 961 tagcggtggt ttttttgttt gcaagcagca gattacgcgc agaaaaaaag gatctcaaga + 1021 agatcctttg attttctacc gaagaaaggc ccacccgtga aggtgagcca gtgagttgat + 1081 tgcagtccag ttacgctgga gtctgaggct cgtcctgaat gatatcaagc ttgaattcgt + 1141 tacgtctcgg gacaaggatc cgtgatcgga aacgtgagat ccagttccgc gagacgaaga + 1201 cgaattctct agatatcgct caatactgac catttaaatc atacctgacc tccatagcag + 1261 aaagtcaaaa gcctccgacc ggaggctttt gacttgatcg gcacgtaaga ggttccaact + 1321 ttcaccataa tgaaataaga tcactaccgg gcgtattttt tgagttatcg agattttcag + 1381 gagctaagga agctaaaatg agccatattc aacgggaaac gtcttgctcg aggccgcgat + 1441 taaattccaa catggatgct gatttatatg ggtataaatg ggctcgcgat aatgtcgggc + 1501 aatcaggtgc gacaatctat cgattgtatg ggaagcccga tgcgccagag ttgtttctga + 1561 aacatggcaa aggtagcgtt gccaatgatg ttacagatga 
gatggtcagg ctaaactggc + 1621 tgacggaatt tatgcctctt ccgaccatca agcattttat ccgtactcct gatgatgcat + 1681 ggttactcac cactgcgatc ccagggaaaa cagcattcca ggtattagaa gaatatcctg + 1741 attcaggtga aaatattgtt gatgcgctgg cagtgttcct gcgccggttg cattcgattc + 1801 ctgtttgtaa ttgtcctttt aacggcgatc gcgtatttcg tctcg +//
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/p6_Kozak-ATG.gb Wed Jun 11 09:42:24 2025 +0000 @@ -0,0 +1,76 @@ +LOCUS . 1856 bp DNA UNK 01-JAN-1980 +DEFINITION . +ACCESSION <unknown id> +VERSION <unknown id> +KEYWORDS . +SOURCE . + ORGANISM . + . +FEATURES Location/Qualifiers + terminator 392..419 + /note="T7Te terminator" + /note="phage T7 early transcription terminator" + rep_origin complement(431..1018) + /direction=LEFT + /note="ori" + /note="high-copy-number ColE1/pMB1/pBR322/pUC origin of + replication" + terminator 1040..1069 + /note="T3Te terminator" + /note="phage T3 early transcription terminator" + misc_feature 1143..1148 + /note="BsmBI" + source 5..1149 + /source="Exported" + misc_feature 1184..1195 + /note="Kozak" + CDS 1192..1194 + /codon_start=1 + /note="ATG" + /translation="M" + source 1154..1196 + /source="Exported" + misc_feature complement(1202..1207) + /note="BsmBI" + terminator 1274..1305 + /note="tonB terminator" + /note="bidirectional E. coli tonB-P14 transcription + terminator" + promoter 1306..1408 + /note="cat promoter" + /note="promoter of the E. 
coli cat gene" + source 1201..1856 + /source="Exported" +ORIGIN + 1 ctcaggcgca atcacgaatg aataacggtt tggttggtgc gagtgatttt gatgacgagc + 61 gtaatggctg gcctgttgaa caagtctgga aagaaatgca taagcttttg ccattctcac + 121 cggattcagt cgtcactcat ggtgatttct cacttgataa ccttattttt gacgagggga + 181 aattaatagg ttgtattgat gttggacgag tcggaatcgc agaccgatac caggatcttg + 241 ccatcctatg gaactgcctc ggtgagtttt ctccttcatt acagaaacgg ctttttcaaa + 301 aatatggtat tgataatcct gatatgaata aattgcagtt tcacttgatg ctcgatgagt + 361 ttttctaatg agggcccaaa tgtaatcacc tggctcacct tcgggtgggc ctttctgcgt + 421 tgctggcgtt tttccatagg ctccgccccc ctgacgagca tcacaaaaat cgatgctcaa + 481 gtcagaggtg gcgaaacccg acaggactat aaagatacca ggcgtttccc cctggaagct + 541 ccctcgtgcg ctctcctgtt ccgaccctgc cgcttaccgg atacctgtcc gcctttctcc + 601 cttcgggaag cgtggcgctt tctcatagct cacgctgtag gtatctcagt tcggtgtagg + 661 tcgttcgctc caagctgggc tgtgtgcacg aaccccccgt tcagcccgac cgctgcgcct + 721 tatccggtaa ctatcgtctt gagtccaacc cggtaagaca cgacttatcg ccactggcag + 781 cagccactgg taacaggatt agcagagcga ggtatgtagg cggtgctaca gagttcttga + 841 agtggtggcc taactacggc tacactagaa gaacagtatt tggtatctgc gctctgctga + 901 agccagttac ctcggaaaaa gagttggtag ctcttgatcc ggcaaacaaa ccaccgctgg + 961 tagcggtggt ttttttgttt gcaagcagca gattacgcgc agaaaaaaag gatctcaaga + 1021 agatcctttg attttctacc gaagaaaggc ccacccgtga aggtgagcca gtgagttgat + 1081 tgcagtccag ttacgctgga gtctgaggct cgtcctgaat gatatcaagc ttgaattcgt + 1141 tacgtctcgc cagaaccgtc agatccgcta gagattacgc caaccgccac catgggcagc + 1201 cgagacgaag acgaattctc tagatatcgc tcaatactga ccatttaaat catacctgac + 1261 ctccatagca gaaagtcaaa agcctccgac cggaggcttt tgacttgatc ggcacgtaag + 1321 aggttccaac tttcaccata atgaaataag atcactaccg ggcgtatttt ttgagttatc + 1381 gagattttca ggagctaagg aagctaaaat gagccatatt caacgggaaa cgtcttgctc + 1441 gaggccgcga ttaaattcca acatggatgc tgatttatat gggtataaat gggctcgcga + 1501 taatgtcggg caatcaggtg cgacaatcta tcgattgtat gggaagcccg atgcgccaga + 1561 gttgtttctg aaacatggca aaggtagcgt tgccaatgat 
gttacagatg agatggtcag + 1621 gctaaactgg ctgacggaat ttatgcctct tccgaccatc aagcatttta tccgtactcc + 1681 tgatgatgca tggttactca ccactgcgat cccagggaaa acagcattcc aggtattaga + 1741 agaatatcct gattcaggtg aaaatattgt tgatgcgctg gcagtgttcc tgcgccggtt + 1801 gcattcgatt cctgtttgta attgtccttt taacggcgat cgcgtatttc gtctcg +//
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/p6_Nt-IgKLsequence.gb Wed Jun 11 09:42:24 2025 +0000 @@ -0,0 +1,83 @@ +LOCUS . 1886 bp DNA UNK 01-JAN-1980 +DEFINITION . +ACCESSION <unknown id> +VERSION <unknown id> +KEYWORDS . +SOURCE . + ORGANISM . + . +FEATURES Location/Qualifiers + terminator 392..419 + /note="T7Te terminator" + /note="phage T7 early transcription terminator" + rep_origin complement(431..1018) + /direction=LEFT + /note="ori" + /note="high-copy-number ColE1/pMB1/pBR322/pUC origin of + replication" + terminator 1040..1069 + /note="T3Te terminator" + /note="phage T3 early transcription terminator" + misc_feature 1143..1148 + /note="BsmBI" + source 5..1149 + /source="Exported" + misc_feature 1154..1165 + /note="KozaK" + CDS 1162..1164 + /codon_start=1 + /note="ATG" + /translation="M" + CDS 1165..1224 + /codon_start=1 + /note="Ig-kappa leader" + /product="leader sequence from mouse immunoglobulin kappa + light chain" + /translation="ETDTLLLWVLLLWVPGSTGD" + source 1154..1226 + /source="Exported" + misc_feature complement(1232..1237) + /note="BsmBI" + terminator 1304..1335 + /note="tonB terminator" + /note="bidirectional E. coli tonB-P14 transcription + terminator" + promoter 1336..1438 + /note="cat promoter" + /note="promoter of the E. 
coli cat gene" + source 1231..1886 + /source="Exported" +ORIGIN + 1 ctcaggcgca atcacgaatg aataacggtt tggttggtgc gagtgatttt gatgacgagc + 61 gtaatggctg gcctgttgaa caagtctgga aagaaatgca taagcttttg ccattctcac + 121 cggattcagt cgtcactcat ggtgatttct cacttgataa ccttattttt gacgagggga + 181 aattaatagg ttgtattgat gttggacgag tcggaatcgc agaccgatac caggatcttg + 241 ccatcctatg gaactgcctc ggtgagtttt ctccttcatt acagaaacgg ctttttcaaa + 301 aatatggtat tgataatcct gatatgaata aattgcagtt tcacttgatg ctcgatgagt + 361 ttttctaatg agggcccaaa tgtaatcacc tggctcacct tcgggtgggc ctttctgcgt + 421 tgctggcgtt tttccatagg ctccgccccc ctgacgagca tcacaaaaat cgatgctcaa + 481 gtcagaggtg gcgaaacccg acaggactat aaagatacca ggcgtttccc cctggaagct + 541 ccctcgtgcg ctctcctgtt ccgaccctgc cgcttaccgg atacctgtcc gcctttctcc + 601 cttcgggaag cgtggcgctt tctcatagct cacgctgtag gtatctcagt tcggtgtagg + 661 tcgttcgctc caagctgggc tgtgtgcacg aaccccccgt tcagcccgac cgctgcgcct + 721 tatccggtaa ctatcgtctt gagtccaacc cggtaagaca cgacttatcg ccactggcag + 781 cagccactgg taacaggatt agcagagcga ggtatgtagg cggtgctaca gagttcttga + 841 agtggtggcc taactacggc tacactagaa gaacagtatt tggtatctgc gctctgctga + 901 agccagttac ctcggaaaaa gagttggtag ctcttgatcc ggcaaacaaa ccaccgctgg + 961 tagcggtggt ttttttgttt gcaagcagca gattacgcgc agaaaaaaag gatctcaaga + 1021 agatcctttg attttctacc gaagaaaggc ccacccgtga aggtgagcca gtgagttgat + 1081 tgcagtccag ttacgctgga gtctgaggct cgtcctgaat gatatcaagc ttgaattcgt + 1141 tacgtctcgc cagccgccac catggaaaca gacacactgc tgctatgggt actgctgctc + 1201 tgggttccag gttccactgg tgacagcagc cgagacgaag acgaattctc tagatatcgc + 1261 tcaatactga ccatttaaat catacctgac ctccatagca gaaagtcaaa agcctccgac + 1321 cggaggcttt tgacttgatc ggcacgtaag aggttccaac tttcaccata atgaaataag + 1381 atcactaccg ggcgtatttt ttgagttatc gagattttca ggagctaagg aagctaaaat + 1441 gagccatatt caacgggaaa cgtcttgctc gaggccgcga ttaaattcca acatggatgc + 1501 tgatttatat gggtataaat gggctcgcga taatgtcggg caatcaggtg cgacaatcta + 1561 tcgattgtat gggaagcccg atgcgccaga gttgtttctg 
aaacatggca aaggtagcgt + 1621 tgccaatgat gttacagatg agatggtcag gctaaactgg ctgacggaat ttatgcctct + 1681 tccgaccatc aagcatttta tccgtactcc tgatgatgca tggttactca ccactgcgat + 1741 cccagggaaa acagcattcc aggtattaga agaatatcct gattcaggtg aaaatattgt + 1801 tgatgcgctg gcagtgttcc tgcgccggtt gcattcgatt cctgtttgta attgtccttt + 1861 taacggcgat cgcgtatttc gtctcg +//
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/p7_L7Ae-Weiss.gb Wed Jun 11 09:42:24 2025 +0000 @@ -0,0 +1,82 @@ +LOCUS . 2169 bp DNA UNK 01-JAN-1980 +DEFINITION . +ACCESSION <unknown id> +VERSION <unknown id> +KEYWORDS . +SOURCE . + ORGANISM . + . +FEATURES Location/Qualifiers + terminator 392..419 + /note="T7Te terminator" + /note="phage T7 early transcription terminator" + rep_origin complement(431..1018) + /direction=LEFT + /note="ori" + /note="high-copy-number ColE1/pMB1/pBR322/pUC origin of + replication" + terminator 1040..1069 + /note="T3Te terminator" + /note="phage T3 early transcription terminator" + misc_feature 1143..1148 + /note="BsmBI" + source 5..1149 + /source="Exported" + CDS 1154..1507 + /codon_start=1 + /note="L7Ae (Weiss)" + /translation="YVRFEVPEDMQNEALSLLEKVRESGKVKKGTNETTKAVERGLAKL + VYIAEDVDPPEIVAHLPLLCEEKNVPYIYVKSKNDLGRAVGIEVPCASAAIINEGELRK + ELGSLVEKIKGLQK" + source 1154..1509 + /source="Exported" + misc_feature complement(1515..1520) + /note="BsmBI" + terminator 1587..1618 + /note="tonB terminator" + /note="bidirectional E. coli tonB-P14 transcription + terminator" + promoter 1619..1721 + /note="cat promoter" + /note="promoter of the E. 
coli cat gene" + source 1514..2169 + /source="Exported" +ORIGIN + 1 ctcaggcgca atcacgaatg aataacggtt tggttggtgc gagtgatttt gatgacgagc + 61 gtaatggctg gcctgttgaa caagtctgga aagaaatgca taagcttttg ccattctcac + 121 cggattcagt cgtcactcat ggtgatttct cacttgataa ccttattttt gacgagggga + 181 aattaatagg ttgtattgat gttggacgag tcggaatcgc agaccgatac caggatcttg + 241 ccatcctatg gaactgcctc ggtgagtttt ctccttcatt acagaaacgg ctttttcaaa + 301 aatatggtat tgataatcct gatatgaata aattgcagtt tcacttgatg ctcgatgagt + 361 ttttctaatg agggcccaaa tgtaatcacc tggctcacct tcgggtgggc ctttctgcgt + 421 tgctggcgtt tttccatagg ctccgccccc ctgacgagca tcacaaaaat cgatgctcaa + 481 gtcagaggtg gcgaaacccg acaggactat aaagatacca ggcgtttccc cctggaagct + 541 ccctcgtgcg ctctcctgtt ccgaccctgc cgcttaccgg atacctgtcc gcctttctcc + 601 cttcgggaag cgtggcgctt tctcatagct cacgctgtag gtatctcagt tcggtgtagg + 661 tcgttcgctc caagctgggc tgtgtgcacg aaccccccgt tcagcccgac cgctgcgcct + 721 tatccggtaa ctatcgtctt gagtccaacc cggtaagaca cgacttatcg ccactggcag + 781 cagccactgg taacaggatt agcagagcga ggtatgtagg cggtgctaca gagttcttga + 841 agtggtggcc taactacggc tacactagaa gaacagtatt tggtatctgc gctctgctga + 901 agccagttac ctcggaaaaa gagttggtag ctcttgatcc ggcaaacaaa ccaccgctgg + 961 tagcggtggt ttttttgttt gcaagcagca gattacgcgc agaaaaaaag gatctcaaga + 1021 agatcctttg attttctacc gaagaaaggc ccacccgtga aggtgagcca gtgagttgat + 1081 tgcagtccag ttacgctgga gtctgaggct cgtcctgaat gatatcaagc ttgaattcgt + 1141 tacgtctcgc agctacgtga gatttgaggt tcctgaggac atgcagaacg aagctctgag + 1201 tctgctggag aaggttaggg agagcggtaa ggtaaagaaa ggtaccaacg aaacgacaaa + 1261 ggctgtggag aggggactgg caaagctcgt ttacatcgca gaggatgttg acccgcctga + 1321 gatcgttgct catctgcccc tcctctgcga ggagaagaat gtgccgtaca tttacgttaa + 1381 aagcaagaac gaccttggaa gggctgtggg cattgaggtg ccatgcgctt cggcagcgat + 1441 aatcaacgag ggagagctga gaaaggagct tggaagcctt gtggagaaga ttaaaggcct + 1501 tcagaagtca ggccgagacg aagacgaatt ctctagatat cgctcaatac tgaccattta + 1561 aatcatacct gacctccata gcagaaagtc aaaagcctcc 
gaccggaggc ttttgacttg + 1621 atcggcacgt aagaggttcc aactttcacc ataatgaaat aagatcacta ccgggcgtat + 1681 tttttgagtt atcgagattt tcaggagcta aggaagctaa aatgagccat attcaacggg + 1741 aaacgtcttg ctcgaggccg cgattaaatt ccaacatgga tgctgattta tatgggtata + 1801 aatgggctcg cgataatgtc gggcaatcag gtgcgacaat ctatcgattg tatgggaagc + 1861 ccgatgcgcc agagttgttt ctgaaacatg gcaaaggtag cgttgccaat gatgttacag + 1921 atgagatggt caggctaaac tggctgacgg aatttatgcc tcttccgacc atcaagcatt + 1981 ttatccgtac tcctgatgat gcatggttac tcaccactgc gatcccaggg aaaacagcat + 2041 tccaggtatt agaagaatat cctgattcag gtgaaaatat tgttgatgcg ctggcagtgt + 2101 tcctgcgccg gttgcattcg attcctgttt gtaattgtcc ttttaacggc gatcgcgtat + 2161 ttcgtctcg +//
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test-JSON_arg.json Wed Jun 11 09:42:24 2025 +0000 @@ -0,0 +1,9 @@ +{ + "JSON_table": "sample", + "JSON_sequence_column": "sequence", + "JSON_annotation_column": "annotation", + "JSON_fragment_column": "fragment", + "JSON_db_uri": "postgresql://postgres:RK17@localhost:5432/test_fragments_db", + "execution": "true" +} + \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test-JSON_arg_block.json Wed Jun 11 09:42:24 2025 +0000 @@ -0,0 +1,9 @@ +{ + "JSON_table": "sample", + "JSON_sequence_column": "sequence", + "JSON_annotation_column": "annotation", + "JSON_fragment_column": "fragment", + "JSON_db_uri": "postgresql://postgres:RK17@localhost:5432/test_fragments_db", + "execution": "false" +} + \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_raport.txt Wed Jun 11 09:42:24 2025 +0000 @@ -0,0 +1,5 @@ +p7_L7Ae-Weiss +p6_Nt-IgKLsequence +p6_Kozak-ATG +p4_Kt-L7Ae-Weiss +HC_Amp_ccdB
# File: verification.py (added in this changeset, Wed Jun 11 09:42:24 2025 +0000)
"""Print every fragment stored in the ``sample`` table for manual inspection.

Usage:
    python verification.py [DB_URI]

When DB_URI is omitted, DEFAULT_DB_URI is used.
"""
import sys

from sqlalchemy import create_engine, text

# Same URI as the "JSON_db_uri" entry in test-data/test-JSON_arg.json.
# Pass your own database URI as the first command-line argument to override
# it instead of editing this file.
DEFAULT_DB_URI = "postgresql://postgres:RK17@localhost:5432/test_fragments_db"


def print_fragments(db_uri=DEFAULT_DB_URI):
    """Dump the fragment, sequence and annotation of every row to stdout.

    Args:
        db_uri: SQLAlchemy connection URI of the database to read from.

    The table name (``sample``) and the column names (``fragment``,
    ``sequence``, ``annotation``) are fixed in the query below; rows are
    printed in ascending ``fragment`` order.
    """
    engine = create_engine(db_uri)
    try:
        with engine.connect() as conn:
            result = conn.execute(text("""
                SELECT fragment, sequence, annotation
                FROM sample
                ORDER BY fragment
            """))

            print("Full contents of fragments in DB:\n")
            for row in result:
                print(f" Fragment: {row.fragment}")
                print(" Sequence:")
                print(row.sequence)
                print("\n Annotation:")
                print(row.annotation)
                print("-" * 80)
    finally:
        # Release the pooled connections even if the query fails.
        engine.dispose()


if __name__ == "__main__":
    # An optional first argument replaces the default connection URI.
    print_fragments(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_DB_URI)
