Mercurial > repos > dcouvin > sraruninfo
changeset 0:1bc09d59c6d7 draft
Uploaded
author | dcouvin |
---|---|
date | Fri, 03 Sep 2021 22:33:14 +0000 |
parents | |
children | d22e2cd2dab5 |
files | SRArunInfo.pl SRArunInfo.xml |
diffstat | 2 files changed, 168 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/perl

use strict;
use warnings;

################################################################
# SRArunInfo.pl
#
# Retrieve run information from the NCBI Sequence Read Archive
# (SRA) for one or more run accessions and write one tab-separated
# summary row per run.
#
# Requirement: the `wget` command must be available on the PATH.
#
# Usage:
#   perl SRArunInfo.pl SRR7693877,SRR9850824,SRR9850830 summary_Runs.tsv
#   perl SRArunInfo.pl list_accessions.txt summary_Runs.tsv
#
# Arguments:
#   1. a comma-separated list of accessions, OR the path of an
#      existing file holding one accession per line, OR a single
#      accession;
#   2. the path of the summary file to create.
################################################################

# Full names of sequencing centers, keyed by the abbreviation found in
# the run-info CSV (e.g. "SC" means "The Wellcome Trust Sanger Institute").
my %CENTER_NAME_OF = (
    SC => 'The Wellcome Trust Sanger Institute',
    BI => 'Broad Institute',
);

# Column titles of the summary file, in output order.
my @SUMMARY_HEADER = (
    'Run',
    'Release_Date',
    'Bases (bp)',
    'Assembly_Name',
    'TaxonomyID',
    'Scientific_Name (or species)',
    'Center_Name',
    'Consent',
    'Country',
    'Library_Strategy',
    'Library_Selection',
    'Library_Source',
    'Library_Layout',
    'Platform',
    'Model',
);

# Literal text fragments that surround the country value in the
# downloaded "FullXml" HTML report. They are matched literally (\Q...\E):
# the second opening fragment contains parentheses that would otherwise
# be interpreted as a regex capture group and never match the real text.
my $COUNTRY_TAG_OPEN  = 'Country></b>';
my $COUNTRY_TAG_CLOSE = '<b>';
my $GEO_ATTR_OPEN
    = '</b>geographic location (country and/or sea region)<b></TAG></b></div><div class="xml-tag"><b><VALUE></b>';
my $GEO_ATTR_CLOSE = '<b><';

my $COUNTRY_TAG_RE = qr/\Q$COUNTRY_TAG_OPEN\E(.*?)\Q$COUNTRY_TAG_CLOSE\E/;
my $GEO_ATTR_RE    = qr/\Q$GEO_ATTR_OPEN\E(.*?)\Q$GEO_ATTR_CLOSE\E/;

# Expand the first command-line argument into a list of accessions.
# A value containing a comma is split on commas; otherwise, if it names
# an existing file, the file is read one accession per line; otherwise
# it is taken as a single accession. Surrounding whitespace is trimmed
# and empty entries are dropped.
sub parse_run_list {
    my ($spec) = @_;

    my @runs;
    if ($spec =~ m/,/) {
        @runs = split /,/, $spec;
    }
    elsif (-e $spec) {
        open my $in, '<', $spec or die "open $spec: $!";
        @runs = <$in>;
        close $in or die "close $spec: $!";
    }
    else {
        @runs = ($spec);
    }

    s/\A\s+|\s+\z//g for @runs;    # also removes newlines and stray "\r"
    return grep { length } @runs;
}

# Download $url into $path with wget. The list form of system() is used
# so that no shell ever sees the arguments. Returns true on success;
# on failure a warning is printed and false is returned.
sub fetch_to_file {
    my ($url, $path) = @_;

    my $status = system('wget', '-q', '-O', $path, $url);
    return 1 if $status == 0;

    my $reason = $status == -1
        ? "could not run wget: $!"
        : 'wget exit status ' . ($status >> 8);
    warn "Warning: download of $url failed ($reason)\n";
    return 0;
}

# Return the first non-empty capture of $re found in $line, or nothing.
sub first_capture {
    my ($line, $re) = @_;

    while ($line =~ /$re/g) {
        return $1 if length $1;
    }
    return;
}

# Scan the HTML report at $path for a country value.
# Returns a two-element list: the value following the "Country" tag and
# the value of the "geographic location (country and/or sea region)"
# attribute. Either element is undef when not found. Scanning stops as
# soon as the "Country" tag value is known, since it takes precedence.
sub extract_country {
    my ($path) = @_;

    my $tag_value;
    my $geo_value;

    open my $in, '<', $path or die "open $path: $!";
    while (my $line = <$in>) {
        chomp $line;
        $tag_value = first_capture($line, $COUNTRY_TAG_RE)
            if !defined $tag_value;
        $geo_value = first_capture($line, $GEO_ATTR_RE)
            if !defined $geo_value;
        last if defined $tag_value;
    }
    close $in or die "close file error : $!";

    return ($tag_value, $geo_value);
}

# Return the comma-split fields of the last line of the CSV at $path
# that contains the accession $run (literal substring match), or an
# empty list when no line mentions it.
# NOTE(review): a plain split on "," does not honour quoted fields that
# themselves contain commas - confirm the run-info CSV never has any.
sub read_runinfo_row {
    my ($path, $run) = @_;

    my @fields;
    open my $in, '<', $path or die "open $path: $!";
    while (my $line = <$in>) {
        chomp $line;
        next if index($line, $run) < 0;
        @fields = split /,/, $line;
    }
    close $in or die "close file error : $!";

    return @fields;
}

# Entry point: fetch both reports for every accession, write one
# summary row per accession, and remove the downloaded files.
sub main {
    my ($runs_spec, $summary_path) = @_;

    print "Welcome to SRA run Info\n";

    if (   !defined $runs_spec
        || !length $runs_spec
        || !defined $summary_path
        || !length $summary_path)
    {
        die "Usage: perl SRArunInfo.pl <accessions|accession_file> <summary_file>\n";
    }

    my @runs = parse_run_list($runs_spec);

    open my $sum, '>', $summary_path or die "open $summary_path: $!";
    print {$sum} join("\t", @SUMMARY_HEADER), "\n";

    RUN:
    for my $run (@runs) {

        # The accession ends up in file names and URLs, so only accept
        # characters that are harmless in both.
        if ($run !~ m/\A[A-Za-z0-9_.-]+\z/) {
            warn "Warning: skipping invalid accession '$run'\n";
            next RUN;
        }

        my $csv_path  = "./$run.csv";
        my $html_path = "./$run.html";

        fetch_to_file(
            "http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&db=sra&rettype=runinfo&term=$run",
            $csv_path,
        );
        fetch_to_file(
            "https://www.ncbi.nlm.nih.gov//sra?term=($run)%20NOT%20cluster_dbgap%5BPROP%5D&report=FullXml",
            $html_path,
        );

        # Country: prefer the "Country" tag, fall back on the
        # geographic-location attribute, else "ND". All per-run state is
        # local so nothing leaks from one accession to the next.
        my ($tag_country, $geo_country) = extract_country($html_path);
        my $country = 'ND';
        if (defined $tag_country) {
            $country = $tag_country;
        }
        elsif (defined $geo_country) {
            print "Country2 = " . $geo_country . "\n";
            $country = $geo_country;
        }

        my @row = read_runinfo_row($csv_path, $run);

        # Missing columns become empty strings instead of undef warnings.
        my $field = sub {
            my ($index) = @_;
            return defined $row[$index] ? $row[$index] : '';
        };

        my $center      = $field->(41);
        my $center_full = exists $CENTER_NAME_OF{$center}
            ? $CENTER_NAME_OF{$center}
            : '';

        print {$sum} join(
            "\t",
            $run,
            ( map { $field->($_) } 1, 4, 8, 27, 28 ),
            "$center ($center_full)",
            $field->(44),
            $country,
            ( map { $field->($_) } 12, 13, 14, 15, 18, 19 ),
        ), "\n";

        # Remove only the files this script created, never a blanket
        # glob of the working directory.
        unlink $csv_path, $html_path;
    }

    close $sum or die "close file error : $!";

    return 0;
}

exit main(@ARGV);
<!--
  Galaxy tool wrapper for SRArunInfo.pl.
  Takes a comma-separated list of SRR/ERR run accessions and produces one
  tabular summary file. The wrapped script calls perl and wget, which must
  be available in the job environment (no <requirements> are declared).
-->
<tool id="sra_run_info" name="SRA run Info" version="1.0">
    <description>Provide information from SRR/ERR run accessions</description>
    <!-- <requirements>
        <requirement type="package" version=""></requirement>
    </requirements> -->
    <!--
      Every substituted value is single-quoted so that the shell passes it
      to the script as exactly one argument, with no word splitting or
      globbing of the user-supplied accession list or of the paths.
    -->
    <command detect_errors="aggressive"><![CDATA[

        perl '${__tool_directory__}/SRArunInfo.pl' '$input' '$output'

    ]]></command>

    <inputs>
        <param name="input" type="text" area="true" value='SRR7693877,SRR9850824,SRR9850830' label="SRR/ERR accession number" help="Comma separated list of IDs (without spaces between IDs)"/>

        <!--<param format="tabular" name="input_files" type="data" multiple="true" label="Annotation files"/>-->
    </inputs>

    <outputs>
        <data format="tabular" name="output" label="Summary file"/>
        <!--<data format="tabular" name="output2" label="HTML file"/>-->
    </outputs>

    <help><![CDATA[
SRArunInfo.pl is a Perl script allowing to provide information from SRR/ERR run accessions

This script belongs to the getSequenceInfo supplementary tools

Example of input: SRR7693877,SRR9850824,SRR9850830

- GitHub: https://github.com/karubiotools/getSequenceInfo/tree/master/supplementary_tools
]]>
    </help>

</tool>