Mercurial > repos > dcouvin > sraruninfo
view SRArunInfo.pl @ 1:d22e2cd2dab5 draft default tip
Uploaded
author | dcouvin |
---|---|
date | Fri, 03 Sep 2021 23:08:48 +0000 |
parents | 1bc09d59c6d7 |
children |
line wrap: on
line source
#!/usr/bin/perl
use strict;
use warnings;

################################################################
# Script allowing to get SRA run information using accession ID
# wget command is required to use this script
################################################################
# example of use:
#   perl SRArunInfo.pl SRR7693877,SRR9850824,SRR9850830 summary_Runs.tsv
#   OR
#   perl SRArunInfo.pl list_accessions.txt summary_Runs.tsv
#
# Arguments:
#   1. a single accession, a comma-separated list of accessions, or the
#      path of a text file holding one accession per line
#   2. path of the tab-separated summary file to write
#
# For every accession the script downloads the "runinfo" CSV and the
# FullXml report page with wget, extracts a few columns plus the country,
# appends one row to the summary, and removes the two downloaded files.

# Center names (e.g. abbreviation SC means: "The Wellcome Trust Sanger Institute")
my %CENTER_NAME_FOR = (
    SC => 'The Wellcome Trust Sanger Institute',
    BI => 'Broad Institute',
);

# Markers surrounding the country value in the downloaded report page.
# They are always matched literally (see first_capture).
# NOTE(review): confirm these markers match the page's real markup (for
# example whether '<' and '>' appear there as HTML entities).
my $COUNTRY_PREFIX  = "Country></b>";
my $COUNTRY_SUFFIX  = "<b>";
my $GEO_LOC_PREFIX  = "</b>geographic location (country and/or sea region)<b></TAG></b></div><div class=\"xml-tag\"><b><VALUE></b>";
my $GEO_LOC_SUFFIX  = "<b><";

# Turn the first command-line argument into a list of accessions.
# Accepts a comma-separated list, the path of an existing file (one
# accession per line), or a single accession. Surrounding whitespace
# (including the CR of CRLF files) is stripped and empty entries dropped.
sub read_accessions {
    my ($spec) = @_;

    my @accessions;
    if ($spec =~ /,/) {
        @accessions = split /,/, $spec;
    }
    elsif (-e $spec) {
        open my $fh, '<', $spec or die "open $spec: $!";
        chomp(@accessions = <$fh>);
        close $fh;
    }
    else {
        @accessions = ($spec);
    }

    for my $accession (@accessions) {
        $accession =~ s/\A\s+//;
        $accession =~ s/\s+\z//;
    }
    return grep { length } @accessions;
}

# Download $url into $dest with wget.
# The list form of system() is used so that no shell ever interprets the
# accession. Dies when wget cannot be started at all; a non-zero exit of
# wget itself is only reported, and processing continues as before.
sub download {
    my ($url, $dest) = @_;

    my $status = system('wget', '-q', '-O', $dest, $url);
    die "cannot run wget: $!\n" if $status == -1;
    warn "wget reported a failure (status $status) for $url\n" if $status != 0;
    return;
}

# Return the text found between $prefix and $suffix on the first line
# where that text is non-empty, or nothing when no line qualifies.
# \Q...\E makes both markers literal: one of them contains parentheses
# that would otherwise act as a capture group.
sub first_capture {
    my ($lines, $prefix, $suffix) = @_;

    for my $line (@{$lines}) {
        if ($line =~ /\Q$prefix\E(.*?)\Q$suffix\E/) {
            my $value = $1;
            return $value if length $value;
        }
    }
    return;
}

# Extract the country from the downloaded report page.
# The "Country" tag is tried first, then the "geographic location"
# attribute; "ND" is returned when neither yields a value.
sub extract_country {
    my ($html_file) = @_;

    open my $fh, '<', $html_file or die "open : $!";
    my @lines = <$fh>;
    close $fh or die "close file error : $!";
    chomp @lines;

    my $country = first_capture(\@lines, $COUNTRY_PREFIX, $COUNTRY_SUFFIX);
    return $country if defined $country;

    $country = first_capture(\@lines, $GEO_LOC_PREFIX, $GEO_LOC_SUFFIX);
    if (defined $country) {
        print "Country2 = " . $country . "\n";
        return $country;
    }
    return "ND";
}

# Return the comma-separated fields of the last line of $csv_file that
# mentions $run (empty list when no line does).
# NOTE(review): a plain split on ',' shifts the columns if a value itself
# contains a quoted comma.
sub read_runinfo_fields {
    my ($csv_file, $run) = @_;

    my @fields;
    open my $fh, '<', $csv_file or die "open : $!";
    while (my $line = <$fh>) {
        chomp $line;
        @fields = split /,/, $line if $line =~ /\Q$run\E/;
    }
    close $fh or die "close file error : $!";
    return @fields;
}

# Entry point: main($accession_spec, $summary_path).
sub main {
    my ($runs, $summary) = @_;

    print "Welcome to SRA run Info\n";

    if (!defined $runs || !length $runs || !defined $summary || !length $summary) {
        die "Usage: perl $0 <accession[,accession...] | accession_list_file> <summary_file>\n";
    }

    my @accessions = read_accessions($runs);

    open my $out, '>', $summary or die "open : $!";
    print {$out} "Run\tRelease_Date\tBases (bp)\tAssembly_Name\tTaxonomyID\tScientific_Name (or species)\tCenter_Name\tConsent\tCountry\tLibrary_Strategy\tLibrary_Selection\tLibrary_Source\tLibrary_Layout\tPlatform\tModel\n";

    for my $run (@accessions) {
        # The accession ends up in file names and URLs; refuse anything
        # that is not a plain identifier.
        if ($run !~ /\A[A-Za-z0-9_.-]+\z/) {
            warn "Skipping invalid accession '$run'\n";
            next;
        }

        my $csv_file  = "./$run.csv";
        my $html_file = "./$run.html";

        download("http://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?save=efetch&db=sra&rettype=runinfo&term=$run", $csv_file);
        download("https://www.ncbi.nlm.nih.gov//sra?term=($run)%20NOT%20cluster_dbgap%5BPROP%5D&report=FullXml", $html_file);

        # Both values are computed afresh for every run so that nothing
        # leaks over from the previous accession.
        my $country = extract_country($html_file);
        my @fields  = read_runinfo_fields($csv_file, $run);

        # Missing columns are written as empty strings.
        my ($release, $bases, $assembly, $tax_id, $species, $center, $consent,
            $strategy, $selection, $source, $layout, $platform, $model)
            = map { defined $fields[$_] ? $fields[$_] : '' }
              (1, 4, 8, 27, 28, 41, 44, 12, 13, 14, 15, 18, 19);

        my $center_name = exists $CENTER_NAME_FOR{$center} ? $CENTER_NAME_FOR{$center} : "";

        print {$out} join("\t",
            $run, $release, $bases, $assembly, $tax_id, $species,
            "$center ($center_name)", $consent, $country,
            $strategy, $selection, $source, $layout, $platform, $model,
        ) . "\n";

        # Remove only the files this script created; a '*.csv' / '*.html'
        # glob would also delete unrelated files and possibly the summary.
        unlink $csv_file, $html_file;
    }

    close $out or die "close file error : $!";
    return;
}

main(@ARGV);