## title: PrOCoil 2.0: R Example Code ## author: Ulrich Bodenhofer and Annette Jacyszyn ## to run this file as a whole and produce a PDF report, enter the following: ## > install.packages("knitr") ## if not already installed ## > library(knitr) ## > stitch("PrOCoil_Example_Code_V2.R") ## this produces a file PrOCoil_Example_Code_V2.tex which is then compiled ## to a PDF file; to compile the PDF, you need LaTeX installed on your ## computer. ## load packages library(kebabs) library(procoil) ## check version of 'procoil' package: vers <- as.numeric(unlist(strsplit(packageDescription("procoil")$Version, "\\."))) if (vers[1] < 2) stop("at least version 2.0.0 of the 'procoil' package is required") ## definition of function for augmenting PDB data augmentDataSet <- function(PDB, BLAST, mapping) { ## input checks if (!is(PDB, "AAStringSet") || !is(BLAST, "AAStringSet")) stop("'PDB' and 'BLAST' need to be 'AAStringSet' objects") ## perform mapping IDs <- unlist(mapping[names(PDB)]) ## select mapped BLAST sequences out <- BLAST[IDs] ## make unique (multiple PDB sequences may be mapped to the same ## BLAST sequence) str <- paste0(as.character(out), as.character(annotationMetadata(out)), as.character(mcols(out)$Class)) ustr <- unique(str) out <- out[match(ustr, str)] ## perform merge out <- c(PDB, out) ## add required metadata metadata(out)$annotationCharset <- "abcdefg" ## return final result out } ## definition of function for creating a 'CCModel' object from a KeBABS model createCCModel <- function(model) { ## input checks if (!is(model, "KBModel")) stop("'model' needs to be 'KBModel' object") if (!is(model@svmInfo@reqKernel, "GappyPairKernel")) stop("'model' does not use the coiled coil kernel") ## extract feature weights weights <- getFeatureWeights(model) ## sort, arrange and properly name feature weights sel <- order(weights[1, ], decreasing=TRUE) weights <- weights[, sel, drop=FALSE] ## create and return final 'CCModel' object new("CCModel", b=model@b, m=as.integer(model@svmInfo@reqKernel@m), scaling=model@svmInfo@reqKernel@normalized, weights=weights) } ## load PDB data set directly from the Web prefix <- "http://www.bioinf.jku.at/software/procoil/data_v2/" con <- url(paste0(prefix, "PrOCoil_PDB_V2.RData")) load(con) close(con) ## if you have 'PrOCoil_PDB_V2.RData' available locally, ## load it directly with ## > load("PrOCoil_PDB_V2.RData") PDBdataX colnames(mcols(PDBdataX)) ## load BLAST data set directly from the Web con <- url(paste0(prefix, "PrOCoil_BLAST_V2.RData")) load(con) close(con) ## if you have 'PrOCoil_BLAST_V2.RData' available locally, ## load it directly with ## > load("PrOCoil_BLAST_V2.RData") BLASTdataX colnames(mcols(BLASTdataX)) ## load PDB -> BLAST mappings directly from the Web con <- url(paste0(prefix, "PrOCoil_Augmentation_V2.RData")) load(con) close(con) ## if you have 'PrOCoil_Augmentation_V2.RData' available locally, ## load it directly with ## > load("PrOCoil_Augmentation_V2.RData") str(PDB2BLASTmapping) ## define coiled coil kernel with m = 5 CCkernel5 <- gappyPairKernel(k=1, m=5, normalize=TRUE, annSpec=TRUE) ## example showing how to perform grouped cross validation on the PDB data set ## with the folds defined in the metadata column 'Fold' res <- kbsvm(x=PDBdataX, y=mcols(PDBdataX)$Class, kernel=CCkernel5, svm="C-svc", pkg="Liblinear", explicit="yes", cross=10, groupBy=mcols(PDBdataX)$Fold, cost=2) cvResult(res) ## example that trains a model on all folds except no. 8 (training set ## augmented with BLAST sequences) and finally makes a prediction on ## fold no. 8 as test set (PDB sequences only) ## select samples testFold <- which(mcols(PDBdataX)$Fold == 8) trainSet <- PDBdataX[-testFold] testSet <- PDBdataX[testFold] ## augment training set trainSet <- augmentDataSet(trainSet, BLASTdataX, PDB2BLASTmapping) trainSet ## train model model <- kbsvm(x=trainSet, y=mcols(trainSet)$Class, kernel=CCkernel5, svm="C-svc", pkg="Liblinear", explicit="yes", cost=2) ## make prediction pred <- predict(model, testSet) evaluatePrediction(pred, mcols(testSet)$Class) ## example how to train the final PrOCoil model ## augment entire PDB data set mergedSet <- augmentDataSet(PDBdataX, BLASTdataX, PDB2BLASTmapping) mergedSet ## train model model <- kbsvm(x=mergedSet, y=mcols(mergedSet)$Class, kernel=CCkernel5, svm="C-svc", pkg="Liblinear", explicit="yes", cost=2) ## convert to 'CCModel' object pModel <- createCCModel(model) pModel ## note that the SVM algorithm used above has a stochastic component, ## so the resulting model will not be identical to the published model ## 'PrOCoilModel', but it will be very similar and produce the same predictions ## compare with 'PrOCoilModel' contained in 'procoil' R package PrOCoilModel