Mercurial > repos > venice-juanillas > convert_format
changeset 2:56d85d5e870a draft default tip
Deleted selected files
author | venice-juanillas |
---|---|
date | Tue, 06 Nov 2012 02:42:00 -0500 |
parents | 255735871a14 |
children | |
files | file_conversion/alchemy2matrix.pl file_conversion/alchemy2matrix.xml file_conversion/matrix2powermarker.pl file_conversion/matrix2powermarker.xml file_conversion/matrix2qgene.pl file_conversion/matrix2qgene.xml file_conversion/matrix2structure.pl |
diffstat | 7 files changed, 0 insertions(+), 842 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env perl
use strict;
use warnings;

###################################################################################
## File: file_conversion/alchemy2matrix.pl
## Author: Venice Margarette B. Juanillas
## Date Created: May 10,2011
## Program Description: This script reads an alchemy-derived text output
##                      and transforms it into matrix form, keeping only:
##                          Sample names
##                          SNP ID
##                          AB Calls
##                      The matrix is then consumed by the other converters.
##
## Usage: perl alchemy2matrix.pl <input> <output> [column]
##        column = zero-based index of the call column (default: 3)
##
## Output: a header row "SNP_Sample_ids<TAB>sample1<TAB>sample2..." followed
##         by one row per SNP: "snp<TAB>call1<TAB>call2...<TAB>".
##         Data rows deliberately keep their trailing tab: the sibling
##         matrix2*.pl scripts were written against that layout.
##
## Citation: This script was made based from Sir Mauleon's existing script.
###################################################################################

## number of non-blank preamble lines at the top of an alchemy file
my $HEADER_LINES = 6;

## call column used when none is given on the command line
my $DEFAULT_COLUMN = 3;

## raw call => matrix notation; calls not listed here are passed through as-is
my %NOTATION_FOR = (
    'AA' => 'A/A',
    'BB' => 'B/B',
    'AB' => 'A/B',
    'BA' => 'A/B',
    ' '  => '-/-',
    'TT' => 'T/T',
    'CC' => 'C/C',
    'TC' => 'T/C',
    'CT' => 'T/C',
    'GG' => 'G/G',
    'GC' => 'C/G',
    'CG' => 'C/G',
    'GA' => 'A/G',
    'AG' => 'A/G',
    'TA' => 'A/T',
    'AT' => 'A/T',
);

check();

## Validates the command line and starts the conversion.
## Exits with status 1 (message on STDERR) when the argument count is wrong,
## so that a calling pipeline notices the failure.
sub check {
    my $param_num = scalar @ARGV;

    if ($param_num < 2 or $param_num > 3) {
        print STDERR "Enter input, output files and column number respectively.\n";
        exit 1;
    }

    ## the column number is optional so the matrix can be built dynamically;
    ## as before, a missing (or false) third argument selects the default
    my $column = $ARGV[2] ? $ARGV[2] : $DEFAULT_COLUMN;
    if ($column !~ /\A\d+\z/) {
        die "Column number must be a non-negative integer, got '$column'.\n";
    }

    main($ARGV[0], $ARGV[1], $column);
    return;
}

## Converts the alchemy file $infile into the matrix file $outfile.
##   $infile  - tab-delimited alchemy output; column 0 = SNP id,
##              column 1 = sample id, column $col = call
##   $outfile - matrix file to create (overwritten if it exists)
##   $col     - zero-based index of the call column
## Rows are grouped by *consecutive* equal SNP ids, as in the input order.
## Dies if either file cannot be opened or the output cannot be flushed.
sub main {
    my ($infile, $outfile, $col) = @_;

    open(my $in,  '<', $infile)  or die "Cannot open file '$infile': $!\n";
    open(my $out, '>', $outfile) or die "Cannot open file '$outfile': $!\n";

    my $skipped        = 0;     ## preamble lines skipped so far
    my $header_printed = 0;     ## has the sample-id header row been written?
    my $current_snp;            ## SNP id of the group being collected
    my @sample_ids;             ## sample ids of the first SNP group
    my @calls;                  ## translated calls of the current group

    ## writes the collected group (and, once, the header row) to the output
    my $flush = sub {
        return unless defined $current_snp;

        if (!$header_printed) {
            ## printed on the first flush, so a file holding a single SNP
            ## still gets its header row
            print {$out} join("\t", 'SNP_Sample_ids', @sample_ids), "\n";
            $header_printed = 1;
        }

        ## every field, including the last one, is followed by a tab
        print {$out} join("\t", $current_snp, @calls), "\t\n";
        @calls = ();
        return;
    };

    while (my $line = <$in>) {
        next if $line =~ /^\s*$/;       ## skip blank lines

        if ($skipped < $HEADER_LINES) { ## skip the alchemy preamble
            $skipped++;
            next;
        }

        ## strip the line ending so a call in the last column still matches
        $line =~ s/\r?\n\z//;
        my @row = split(/\t/, $line, -1);

        ## a new SNP id closes the previous group
        if (!defined $current_snp or $row[0] ne $current_snp) {
            $flush->();
            $current_snp = $row[0];
        }

        ## only the first group contributes to the header row
        if (!$header_printed) {
            push(@sample_ids, defined $row[1] ? $row[1] : '');
        }

        push(@calls, translate_call($row[$col]));
    }

    $flush->();     ## write the last SNP group

    close($in);
    close($out) or die "Cannot write file '$outfile': $!\n";
    return;
}

## Returns the matrix notation for one raw call (e.g. "AB" -> "A/B").
## Unknown calls are returned unchanged; a call that is missing because the
## row is too short is reported as the missing-data symbol "-/-".
## Based on Sir Mau's pushGenotype code.
sub translate_call {
    my ($call) = @_;

    return '-/-' unless defined $call;
    return exists $NOTATION_FOR{$call} ? $NOTATION_FOR{$call} : $call;
}

## end of script
<!-- File: file_conversion/alchemy2matrix.xml
     Galaxy wrapper for alchemy2matrix.pl. The optional third script argument
     (call column) is not exposed, so the script's default column is used. -->
<tool id="alchemy2matrix" name="Alchemy to Matrix">
  <description>file converter</description>
  <command interpreter="perl">alchemy2matrix.pl $input $output</command>
  <inputs>
    <param format="tabular" name="input" type="data" label="Source file"/>
  </inputs>
  <outputs>
    <data format="tabular" name="output" />
  </outputs>

  <tests>
    <test>
      <param name="input" value="raw_data.txt"/>
      <!-- the name must match the <data name="output"> element above -->
      <output name="output" file="output.txt"/>
    </test>
  </tests>

  <help>
This tool converts an alchemy-output file format to a SNP matrix format.
  </help>

</tool>
#!/usr/bin/env perl
use strict;
use warnings;

#############################################################################################
## File: file_conversion/matrix2powermarker.pl
## Author: Venice Margarette B. Juanillas
## Date: May 13,2011
## Program Description: This script transforms a matrix into a Powermarker dataset by
##                      transposing it: the column names (samples) become the rows and
##                      the SNP ids become the new columns.
##
## Usage: perl matrix2powermarker.pl <matrix input> <output>
#############################################################################################

## check for file parameters
if (scalar(@ARGV) != 2) {
    print STDERR "No Parameters specified...Specify 1.) input data 2.) output file\n";
    exit 1;
}
main($ARGV[0], $ARGV[1]);

## Splits one matrix line into its tab-separated cells.
## The line ending is removed, and so is a single trailing tab, because
## alchemy2matrix.pl terminates every data row with one. Empty cells in the
## middle of a row are kept.
sub split_row {
    my ($line) = @_;

    $line =~ s/\r?\n\z//;
    $line =~ s/\t\z//;
    return split(/\t/, $line, -1);
}

## Transposes the tab-delimited matrix $infile into $outfile.
##   $infile  - matrix whose first non-blank line is the header row
##   $outfile - file to create (overwritten if it exists)
## Output line k holds column k of the input: the header cell followed by
## that column's value from every data row. Cells missing from a short row
## are written as empty fields.
## Dies if either file cannot be opened or the output cannot be flushed.
sub main {
    my ($infile, $outfile) = @_;

    open(my $in,  '<', $infile)  or die "Cannot open input file specified ($infile): $!\n";
    open(my $out, '>', $outfile) or die "Cannot open output file specified ($outfile): $!\n";

    my $seen_header = 0;
    my @transposed;     ## one growing output line per input column

    while (my $line = <$in>) {
        next if $line =~ /^\s*$/;   ## skip blank lines

        my @cells = split_row($line);

        if (!$seen_header) {
            ## the header cells start the output lines
            @transposed  = @cells;
            $seen_header = 1;
            next;
        }

        ## append this row's cell to every column, including the last one
        ## (the previous loop bound stopped one column short)
        for my $i (0 .. $#transposed) {
            my $cell = defined $cells[$i] ? $cells[$i] : '';
            $transposed[$i] .= "\t" . $cell;
        }
    }

    ## write out the transposed matrix
    for my $out_line (@transposed) {
        print {$out} $out_line . "\n";
    }

    close($in);
    close($out) or die "Cannot write output file specified ($outfile): $!\n";
    return;
}

## end of script
<!-- File: file_conversion/matrix2powermarker.xml
     Galaxy wrapper for matrix2powermarker.pl. -->
<tool id="matrix2powermarker" name="Matrix to Powermarker">
  <description>file conversion</description>
  <command interpreter="perl">matrix2powermarker.pl $input $output</command>
  <inputs>
    <param format="tabular" name="input" type="data" label="Matrix file"/>
  </inputs>
  <outputs>
    <data format="tabular" name="output" label="${input.name} Powermarker Format"/> <!--change labels to create more informative output names-->
  </outputs>

  <tests>
    <test>
      <!-- A test needs a value for the required "input" parameter.
           NOTE(review): "matrix.txt" is a placeholder name; point it at the
           matrix fixture that out.txt was generated from. -->
      <param name="input" value="matrix.txt"/>
      <output name="output" file="out.txt"/>
    </test>
  </tests>

  <help>
This tool converts a SNP matrix file to Powermarker file format.
  </help>

</tool>
#!/usr/bin/env perl
use strict;
use warnings;

#############################################################################
## File: file_conversion/matrix2qgene.pl
## Author: Venice Margarette B. Juanillas
## Date Created: May 19, 2011
## Description: This script transforms a tabular matrix file into the
##              QGENE data file format.
##
## Usage: perl matrix2qgene.pl <matrix> <output> <map> [header] [trait]
##
## Output sections:
##   [header]  copied from the header file, or a "[Header]" / "[Locus]"
##             stub when no header file is given
##   [locus]   one row per SNP of the matrix:
##                 SNP_id  cM_num  cM_pos  marker_data...
##             cM_num and cM_pos are "-" for SNPs absent from the map
##   [Trait]   copied from the trait file, or a bare "[Trait]" line
##
## Genotype symbols written to the locus section:
##     A/A = 1   B/B = 2   A/B = 3   C/C = 4   D/D = 5   -/- = 6
## NOTE(review): earlier documentation of this script listed "-/- = 0",
## but the code has always written 6; the code's value is kept here.
## Confirm which one the header file in use declares.
##
## Everything is processed in memory; no temporary files are created.
#############################################################################

## matrix notation => QGENE genotype symbol; other values pass through as-is
my %CODE_FOR = (
    'A/A' => '1',
    'B/B' => '2',
    'A/B' => '3',
    'C/C' => '4',
    'D/D' => '5',
    '-/-' => '6',
);

check();

## Validates the command line and writes the QGENE file.
## Exits with status 1 (usage on STDERR) when the argument count is wrong.
## The header and the trait file are handled independently of each other, so
## a header file is honoured even when no trait file is supplied.
sub check {
    my $params = scalar @ARGV;

    if ($params < 3 or $params > 5) {
        print STDERR "Incorrect file parameters...\n";
        print STDERR "ARGV[0] = matrix file\n";     ## user created
        print STDERR "ARGV[1] = output file\n";
        print STDERR "ARGV[2] = map file\n";        ## user created
        print STDERR "ARGV[3] = header file\n";
        print STDERR "ARGV[4] = trait file\n";
        exit 1;
    }

    my ($matrix_file, $output_file, $map_file, $header_file, $trait_file) = @ARGV;

    ## read both inputs before touching the output file
    my $mapping_for = read_map($map_file);
    my @locus_rows  = create_matrix($matrix_file, $mapping_for);

    open(my $out, '>', $output_file) or die "Cannot open $output_file: $!\n";

    if ($header_file) {
        append_header($header_file, $out);
    }
    else {
        print "No header section\n";
        print {$out} "[Header]" . "\n" . "[Locus]" . "\n";
    }

    print {$out} @locus_rows;

    if ($trait_file) {
        append_trait($out, $trait_file);
    }
    else {
        print "No trait section\n";
        print {$out} "[Trait]" . "\n";
    }

    close($out) or die "Cannot write $output_file: $!\n";
    return;
}

## Splits one tab-delimited line into its cells.
## The line ending is removed, and so is a single trailing tab, because
## alchemy2matrix.pl terminates every data row with one. Empty cells in the
## middle of a row are kept.
sub split_row {
    my ($line) = @_;

    $line =~ s/\r?\n\z//;
    $line =~ s/\t\z//;
    return split(/\t/, $line, -1);
}

## Returns the non-blank lines of $file without its header line.
## The header is taken to be the first non-blank line of the file as
## written, so it can no longer be mistaken for (or replace) a data row.
## Dies if the file cannot be opened.
sub read_data_lines {
    my ($file) = @_;

    open(my $in, '<', $file) or die "Cannot open $file: $!\n";
    my @lines = grep { $_ !~ /^\s*$/ } <$in>;
    close($in);

    shift @lines;   ## drop the header line
    return @lines;
}

## Loads the map file into a hash reference: SNP id => [cM_num, cM_pos].
## Map columns are read as: 0 = SNP id, 1 = cM/chromosome number,
## 2 = position. Columns missing from a row are stored as "-".
## When a SNP id occurs more than once, the line that sorts first wins,
## which matches the previous lookup in the sorted map.
sub read_map {
    my ($file) = @_;
    my %mapping_for;

    for my $line (sort(read_data_lines($file))) {
        my ($snp, $cm_num, $cm_pos) = split_row($line);
        next if exists $mapping_for{$snp};

        $mapping_for{$snp} = [
            defined $cm_num ? $cm_num : '-',
            defined $cm_pos ? $cm_pos : '-',
        ];
    }

    return \%mapping_for;
}

## Builds the rows of the locus section.
##   $matrix_file - SNP matrix; column 0 = SNP id, remaining columns = calls
##   $mapping_for - hash reference returned by read_map()
## Returns the finished rows ("SNP_id<TAB>cM_num<TAB>cM_pos<TAB>markers\n")
## ordered by cM_num, then SNP id, then position, compared as strings
## (so "10" sorts before "2", as it always did).
## Every row of the matrix is returned; none is lost at either end.
sub create_matrix {
    my ($matrix_file, $mapping_for) = @_;
    my @keyed_rows;

    for my $line (read_data_lines($matrix_file)) {
        my ($snp, @calls) = split_row($line);

        ## SNPs without a map entry get "-" for both map columns
        my ($cm_num, $cm_pos) = ('-', '-');
        if (exists $mapping_for->{$snp}) {
            ($cm_num, $cm_pos) = @{ $mapping_for->{$snp} };
        }

        my $alleles = join("\t", map { change_marker($_) } @calls);

        ## sort key: cM number first; output row: SNP id first
        push(@keyed_rows, [
            $cm_num . "\t" . $snp . "\t" . $cm_pos . "\t" . $alleles . "\n",
            $snp . "\t" . $cm_num . "\t" . $cm_pos . "\t" . $alleles . "\n",
        ]);
    }

    my @sorted = sort { $a->[0] cmp $b->[0] } @keyed_rows;
    return map { $_->[1] } @sorted;
}

## Returns the QGENE genotype symbol for one matrix call (e.g. "A/B" -> "3").
## Calls without a symbol are returned unchanged.
sub change_marker {
    my ($marker) = @_;

    return exists $CODE_FOR{$marker} ? $CODE_FOR{$marker} : $marker;
}

## Copies the header file to the open output handle $out, dropping blank
## lines, and finishes the section with one empty line.
## Dies if the header file cannot be opened.
sub append_header {
    my ($infile, $out) = @_;

    open(my $in, '<', $infile) or die "Cannot open $infile: $!\n";

    while (my $line = <$in>) {
        next if $line =~ /^\s*$/;
        print {$out} $line;
    }
    print {$out} "\n";

    close($in);
    return;
}

## Copies the trait file verbatim to the open output handle $out.
## The trait file is opened read-only, so read-only datasets work too.
## Dies if the trait file cannot be opened.
sub append_trait {
    my ($out, $trait_file) = @_;

    open(my $in, '<', $trait_file) or die "Cannot open $trait_file: $!\n";

    while (my $line = <$in>) {
        print {$out} $line;
    }

    close($in);
    return;
}

## end of the script
## Date Completed: May 23,2011
## Revised: May 28,2011
## Remarks: For further testing
<!-- File: file_conversion/matrix2qgene.xml
     Galaxy wrapper for matrix2qgene.pl.
     Argument order expected by the script: matrix, output, map, header, trait. -->
<tool id="matrix2qgene" name="Matrix to QGene">
  <description>file format conversion</description>
  <command interpreter="perl">matrix2qgene.pl $matrix $output $map $header $trait</command>
  <inputs>
    <param format="tabular" name="matrix" type="data" label="Matrix file"></param>
    <param format="tabular" name="map" type="data" label="Physical Map file"></param>
    <param format="tabular" name="header" type="data" label="Header file"></param>
    <param format="tabular" name="trait" type="data" label="Trait file"></param>
  </inputs>
  <outputs>
    <data format="tabular" name="output" label="${matrix.name} QGene format"/>
  </outputs>

  <tests>
    <test>
      <!-- The name must match the <data name="output"> element above.
           NOTE(review): this test still supplies no input parameters, and
           "${matrix}.qdf" looks like a template variable rather than a
           fixture file name; confirm and replace with real test data. -->
      <output name="output" file="${matrix}.qdf"/>
    </test>
  </tests>

  <help>
This tool converts a SNP matrix file to QGene file format.
  </help>

</tool>
#!/usr/bin/env perl
use strict;
use warnings;

use File::Temp qw(tempfile);

##########################################################################################
## File: file_conversion/matrix2structure.pl
## Author: Venice Margarette B. Juanillas
## Date Created: May 27,2011
## Script Description: This script is intended to transform a matrix into its structure
##                     data file format.
## Disclaimer: This script is not yet complete. This script cannot still fully be used
##             for Structure transformation.
##
## Usage: perl matrix2structure.pl <matrix input> <output>
##########################################################################################

## matrix notation => symbol written to the output; other values pass through
## NOTE(review): several entries look provisional (T/T -> "AT", A/B -> "B",
## C/G -> "G"). They are reproduced exactly as the script has always written
## them; confirm against the target Structure format before relying on them.
my %SYMBOL_FOR = (
    '-/-' => 'N',
    'A/A' => 'A',
    'C/C' => 'C',
    'A/B' => 'B',
    'B/B' => 'B',
    'C/G' => 'G',
    'G/G' => 'G',
    'A/T' => 'AT',
    'T/T' => 'AT',
    'T/C' => 'TC',
    'B/C' => 'BC',
    'A/G' => 'AG',
);

## check for file parameters
if (scalar(@ARGV) != 2) {
    print STDERR "No Parameters specified...Specify 1.) matrix input data 2.) output file\n";
    exit 1;
}

{
    ## The transposed matrix is staged in a private temporary file that is
    ## removed when the script exits, instead of a fixed "line.txt" in the
    ## working directory that concurrent runs would overwrite.
    my ($temp_fh, $temp_file) = tempfile('matrix2structure_XXXXXX', TMPDIR => 1, UNLINK => 1);
    close($temp_fh);

    transpose_data($ARGV[0], $temp_file);
    change_symbol($temp_file, $ARGV[1]);
}

## Splits one tab-delimited line into its cells.
## The line ending is removed, and so is a single trailing tab, because
## alchemy2matrix.pl terminates every data row with one. Empty cells in the
## middle of a row are kept.
sub split_row {
    my ($line) = @_;

    $line =~ s/\r?\n\z//;
    $line =~ s/\t\z//;
    return split(/\t/, $line, -1);
}

## Transposes the tab-delimited matrix $infile into $outfile.
## The header cells (sample names) become the first field of each output
## line and are followed by that column's value from every data row, so the
## SNP ids end up as the columns. Cells missing from a short row are written
## as empty fields. Every column is written, including the last one.
## Dies if either file cannot be opened or the output cannot be flushed.
sub transpose_data {
    my ($infile, $outfile) = @_;

    open(my $in,  '<', $infile)  or die "Cannot open $infile: $!\n";
    open(my $out, '>', $outfile) or die "Cannot open $outfile: $!\n";

    my $seen_header = 0;
    my @markers;        ## one growing output line per input column

    while (my $line = <$in>) {
        next if $line =~ /^\s*$/;   ## skip blank lines

        my @row = split_row($line);

        if (!$seen_header) {
            ## the header cells start the output lines
            @markers     = @row;
            $seen_header = 1;
            next;
        }

        ## append this row's alleles to their respective columns
        for my $i (0 .. $#markers) {
            my $cell = defined $row[$i] ? $row[$i] : '';
            $markers[$i] .= "\t" . $cell;
        }
    }

    for my $out_line (@markers) {
        print {$out} $out_line . "\n";
    }

    close($in);
    close($out) or die "Cannot write $outfile: $!\n";
    return;
}

## Rewrites the genotype notation of the transposed matrix $file1 into
## $file2. The first non-blank line (the SNP id row) is copied unchanged;
## on every other line each tab-separated field, including the last one, is
## replaced by its entry in %SYMBOL_FOR when it has one.
## Dies if either file cannot be opened or the output cannot be flushed.
sub change_symbol {
    my ($file1, $file2) = @_;

    open(my $in,  '<', $file1) or die "Cannot open $file1: $!\n";
    open(my $out, '>', $file2) or die "Cannot open $file2: $!\n";

    my $seen_header = 0;

    while (my $line = <$in>) {
        next if $line =~ /^\s*$/;   ## skip blank lines

        if (!$seen_header) {
            print {$out} $line;
            $seen_header = 1;
            next;
        }

        ## the staged file is tab-delimited by construction, so only tabs
        ## separate fields; names containing spaces stay in one piece
        $line =~ s/\r?\n\z//;
        my @row = split(/\t/, $line, -1);
        my @translated = map { exists $SYMBOL_FOR{$_} ? $SYMBOL_FOR{$_} : $_ } @row;

        print {$out} join("\t", @translated) . "\n";
    }

    close($in);
    close($out) or die "Cannot write $file2: $!\n";
    return;
}

## 1.) This still needs further familiarization of the haploid, diploid, n-row formats
## 2.) How are alleles distributed per loci?