# HG changeset patch # User davidvanzessen # Date 1489498216 14400 # Node ID b8ac74723ab0cf991053ea251ec9dbe9efa7675a # Parent ca2512e1e3ab1dbf153622e4abb6719a1f42e166 Uploaded diff -r ca2512e1e3ab -r b8ac74723ab0 merge_and_filter.r --- a/merge_and_filter.r Thu Dec 29 07:05:45 2016 -0500 +++ b/merge_and_filter.r Tue Mar 14 09:30:16 2017 -0400 @@ -36,6 +36,11 @@ colnames(gene_identification) = c("Sequence.ID", "chunk_hit_percentage", "nt_hit_percentage", "start_locations", "best_match") } +print("Summary analysis files columns") +print(names(summ)) + + + input.sequence.count = nrow(summ) print(paste("Number of sequences in summary file:", input.sequence.count)) @@ -70,31 +75,37 @@ filtering.steps = rbind(filtering.steps, c("After functionality filter", nrow(summ))) -#print("mutation analysis files columns") -#print(names(mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])])) +print("mutation analysis files columns") +print(names(mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])])) result = merge(summ, mutationanalysis[,!(names(mutationanalysis) %in% names(summ)[-1])], by="Sequence.ID") print(paste("Number of sequences after merging with mutation analysis file:", nrow(result))) -#print("mutation stats files columns") -#print(names(mutationstats[,!(names(mutationstats) %in% names(result)[-1])])) +print("mutation stats files columns") +print(names(mutationstats[,!(names(mutationstats) %in% names(result)[-1])])) result = merge(result, mutationstats[,!(names(mutationstats) %in% names(result)[-1])], by="Sequence.ID") print(paste("Number of sequences after merging with mutation stats file:", nrow(result))) -#print("hotspots files columns") -#print(names(hotspots[,!(names(hotspots) %in% names(result)[-1])])) +print("hotspots files columns") +print(names(hotspots[,!(names(hotspots) %in% names(result)[-1])])) result = merge(result, hotspots[,!(names(hotspots) %in% names(result)[-1])], by="Sequence.ID") print(paste("Number of sequences after 
merging with hotspots file:", nrow(result))) +print("sequences files columns") +print(c("FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")) + sequences = sequences[,c("Sequence.ID", "FR1.IMGT", "CDR1.IMGT", "FR2.IMGT", "CDR2.IMGT", "FR3.IMGT", "CDR3.IMGT")] names(sequences) = c("Sequence.ID", "FR1.IMGT.seq", "CDR1.IMGT.seq", "FR2.IMGT.seq", "CDR2.IMGT.seq", "FR3.IMGT.seq", "CDR3.IMGT.seq") result = merge(result, sequences, by="Sequence.ID", all.x=T) +print("AA sequences files columns") +print("CDR3.IMGT") + AAs = AAs[,c("Sequence.ID", "CDR3.IMGT")] names(AAs) = c("Sequence.ID", "CDR3.IMGT.AA") result = merge(result, AAs, by="Sequence.ID", all.x=T) diff -r ca2512e1e3ab -r b8ac74723ab0 wrapper.sh --- a/wrapper.sh Thu Dec 29 07:05:45 2016 -0500 +++ b/wrapper.sh Tue Mar 14 09:30:16 2017 -0400 @@ -421,22 +421,30 @@ if [[ "$fast" == "no" ]] ; then + + echo "---------------- baseline ----------------" echo "---------------- baseline ----------------
" >> $log tmp="$PWD" mkdir $outdir/baseline + echo "

BASELINe

" >> $output + header_substring="Based on CDR1, FR2, CDR2, FR3 (27:27:38:55:65:104:-)" + baseline_boundaries="27:27:38:55:65:104:-" if [[ "${empty_region_filter}" == "leader" ]] ; then baseline_boundaries="1:26:38:55:65:104:-" + header_substring="Based on FR1, CDR1, FR2, CDR2, FR3 (1:26:38:55:65:104:-)" fi + + echo "

${header_substring}

" >> $output mkdir $outdir/baseline/IGA_IGG_IGM if [[ $(wc -l < $outdir/new_IMGT/1_Summary.txt) -gt "1" ]]; then cd $outdir/baseline/IGA_IGG_IGM - bash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "${baseline_boundaries}" $outdir/new_IMGT.txz "IGA_IGG_IGM_IGE" "$dir/baseline/IMGTVHreferencedataset20161215.fa" "$outdir/baseline.pdf" "Sequence.ID" "$outdir/baseline.txt" + bash $dir/baseline/wrapper.sh 1 1 1 1 0 0 "${baseline_boundaries}" $outdir/new_IMGT.txz "IGA_IGG_IGM_IGE" "$dir/baseline/IMGTVHreferencedataset20161215.fa" "$outdir/baseline.pdf" "Sequence.ID" "$outdir/baseline.txt" else echo "No sequences" > "$outdir/baseline.txt" fi