The sqlrutils package provides a mechanism for R users to wrap their R script into a TSQL stored procedure, register that stored procedure with a database, and test it from an R development environment. It enables users to:
InputData (name, defaultQuery = NULL, query = NULL) – S3 object that captures the information about the data that comes into an R data frame from SQL Server.
Parameter details:
InputParameter(name, type, defaultValue = NULL, defaultQuery = NULL, value = NULL, enableOutput = FALSE) – S3 object that captures information about a single input parameter to the R function that is to be embedded into the TSQL script. The user must provide the name and the R type of the input parameter.
Parameter details:
OutputData(name) – S3 object that needs to be created if the function that is to be embedded into the TSQL returns a list that contains a data frame. The OutputData object captures the information about the name of the data frame inside the list. The return list can contain at most one data frame.
Parameter details:
OutputParameter(name, type) – S3 object. Must be created if the function that is to be embedded into the TSQL returns a list. It captures the information about a single member of the list that is NOT a data frame. The user must provide the name of the R object in the list as well as its R type.
Parameter details:
StoredProcedure(func, spName, ..., filePath = NULL, dbName = NULL, connectionString = NULL, batchSeparator = "GO") – S3 object. The constructor generates a SQL Server stored procedure object and optionally a .sql file containing a query to create a stored procedure. StoredProcedure$registrationVec contains strings representing the queries needed for creation of the stored procedure.
Parameter details:
getInputParameters(sqlSP) – S3 method that returns a list containing the InputData/Parameter objects with which the StoredProcedure was created. The objects do not contain queries or values associated with them. Those must be set explicitly in order to execute the stored procedure, unless the object contains a default value.
Parameter details:
setInputDataQuery(inputData, query) – S3 method which assigns a query to the InputData object (inputData) that is to be passed to executeStoredProcedure in order to execute a stored procedure.
Parameter details:
setInputParameterValue(inParam, value) – S3 method which assigns a value to the InputParameter object (inParam) that is to be passed to executeStoredProcedure in order to execute a stored procedure.
Parameter details:
registerStoredProcedure(sqlSP, connectionString = NULL) – S3 method which registers the stored procedure with the specified database using the registrationVec component of the StoredProcedure object (sqlSP).
Parameter details:
executeStoredProcedure(sqlSP, ..., connectionString = NULL) – S3 method which executes the stored procedure represented by the sqlSP object.
Parameter details:
Note:
This function requires an ODBC 3.8 driver. For SQL Server, ODBC Driver 13 for SQL Server is a good choice.
> # etl1: ETL step meant to be wrapped into a stored procedure.
> # Reads the top 10000 rows of AirlineDemoSmall, adds an integer CRSDepHour
> # column derived from CRSDepTime, and writes the result to the cleanData
> # table (which is dropped first if it already exists).
> # Takes no arguments and returns NULL.
> etl1 <- function() {
+ # The query to get the data
+ qq <- "select top 10000 ArrDelay,CRSDepTime,DayOfWeek from AirlineDemoSmall"
+ # The connection string. For executeStoredProcedure to work, an ODBC 3.8 driver is needed.
+ conStr <- paste0("Driver={ODBC Driver 13 for SQL Server};Server=.;Database=RevoTestDB;",
+ "Trusted_Connection=Yes;")
+ # The data source - retrieves the data from the database
+ dsSqls <- RxSqlServerData(sqlQuery = qq, connectionString = conStr)
+ # The destination data source
+ dsSqls2 <- RxSqlServerData(table ="cleanData", connectionString = conStr)
+ # A transformation function: adds CRSDepHour as the truncated, integer CRSDepTime
+ transformFunc <- function(data) {
+ data$CRSDepHour <- as.integer(trunc(data$CRSDepTime))
+ return(data)
+ }
+ # The transformation variables
+ transformVars <- c("CRSDepTime")
+ # set the compute context
+ sqlCompute <- RxInSqlServer(numTasks = 4, connectionString = conStr)
+ rxSetComputeContext(sqlCompute)
+ # drop table if necessary
+ if (rxSqlServerTableExists("cleanData")) {
+ rxSqlServerDropTable("cleanData")
+ }
+ # perform the transformation
+ rxDataStep(inData = dsSqls,
+ outFile = dsSqls2,
+ transformFunc = transformFunc,
+ transformVars = transformVars,
+ overwrite = TRUE)
+ return(NULL)
+ }
>
> # create the SQL Server stored procedure object wrapping etl1 (procedure name: spETL_ds_to_ds)
> etlSP1 <- StoredProcedure("etl1", "spETL_ds_to_ds",
+ filePath = "C:\\Users\\user\\Documents",
+ dbName ="RevoTestDB")
> # the connection string is necessary for registration and execution
> # since we did not pass it to StoredProcedure
> conStr <- "Driver={ODBC Driver 13 for SQL Server};Server=.;Database=RevoTestDB;Trusted_Connection=Yes;"
> # register the stored procedure with the database
> registerStoredProcedure(etlSP1, conStr)
[1] TRUE
> # execute the stored procedure
> executeStoredProcedure(etlSP1, connectionString = conStr, verbose = TRUE)
exec spETL_ds_to_ds
named list()
> # train1: fits a linear model on the input data frame, stores it in the
> # rdata table under the key "linmod.v1", and returns the serialized,
> # gzip-compressed model wrapped in a data frame.
> # in_df - input data frame; the formula uses ArrDelay, CRSDepTime,
> #         DayOfWeek and CRSDepHour
> train1 <- function(in_df) {
+ # DayOfWeek is converted to a factor with a fixed, explicit level order
+ factorLevels <- c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")
+ in_df[,"DayOfWeek"] <- factor(in_df[,"DayOfWeek"], levels=factorLevels)
+ # The model formula
+ formula <- ArrDelay ~ CRSDepTime + DayOfWeek + CRSDepHour:DayOfWeek
+
+ # Train the model
+ mm <- rxLinMod(formula, data = in_df, transformFunc = NULL, transformVars = NULL)
+
+ # Store the model into the database
+ # the rdata table needs to be created beforehand
+ conStr <- paste0("Driver={ODBC Driver 13 for SQL Server};Server=.;",
+ "Database=RevoTestDB;Trusted_Connection=Yes;")
+ out.table <- "rdata"
+ # write the model to the table
+ ds <- RxOdbcData(table = out.table, connectionString = conStr)
+
+ rxWriteObject(ds, "linmod.v1", mm, keyName = "key",
+ valueName = "value")
+
+ # the model needs to be serialized before it can be added to the return list
+ mm <- memCompress(serialize(mm, connection = NULL), type="gzip")
+ return(data.frame(mm))
+ }
>
> conStr <- "Driver={ODBC Driver 13 for SQL Server};Server=.;Database=RevoTestDB;Trusted_Connection=Yes;"
> # create an InputData object for the input data frame in_df
> indata <- InputData("in_df",
+ defaultQuery = paste0("select top 10000 ArrDelay,CRSDepTime,",
+ "DayOfWeek,CRSDepHour from cleanData"))
> # create the SQL Server stored procedure object
> trainSP1 <- StoredProcedure('train1', "spTrain_df_to_df", indata,
+ dbName = "RevoTestDB",
+ connectionString = conStr,
+ filePath = "C:\\Users\\user\\Documents")
> # registerStoredProcedure and executeStoredProcedure do not require a connection string since we
> # provided one when we created trainSP1
> registerStoredProcedure(trainSP1)
[1] TRUE
> model <- executeStoredProcedure(trainSP1, verbose = TRUE)
exec spTrain_df_to_df
> result2v2 <- rxReadObject(model$data[1,1][[1]])
> result2v2
Call:
rxLinMod(formula = formula, data = in_df, transformFunc = NULL,
transformVars = NULL)
Linear Regression Results for: ArrDelay ~ CRSDepTime + DayOfWeek + CRSDepHour:DayOfWeek
Data: in_df
Dependent variable(s): ArrDelay
Total independent variables: 16 (Including number dropped: 7)
Number of valid observations: 9713
Number of missing observations: 287
Coefficients:
ArrDelay
(Intercept) -12.2606079
CRSDepTime 0.8384946
DayOfWeek=Monday 9.9722108
DayOfWeek=Tuesday 10.7932985
DayOfWeek=Wednesday 0.2561862
DayOfWeek=Thursday Dropped
DayOfWeek=Friday Dropped
DayOfWeek=Saturday Dropped
DayOfWeek=Sunday Dropped
CRSDepHour for DayOfWeek=Monday -0.4449961
CRSDepHour for DayOfWeek=Tuesday -0.6194943
CRSDepHour for DayOfWeek=Wednesday 1.0483718
CRSDepHour for DayOfWeek=Thursday 0.5470396
CRSDepHour for DayOfWeek=Friday Dropped
CRSDepHour for DayOfWeek=Saturday Dropped
CRSDepHour for DayOfWeek=Sunday Dropped
> train2 <- function(in_df) {
+ factorLevels <- c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")
+ in_df[,"DayOfWeek"] <- factor(in_df[,"DayOfWeek"], levels=factorLevels)
+ # The model formula: ArrDelay modeled on CRSDepTime, DayOfWeek and the CRSDepHour:DayOfWeek interaction
+ formula <- ArrDelay ~ CRSDepTime + DayOfWeek + CRSDepHour:DayOfWeek
+ # Train the model after switching to the "local" compute context
+ rxSetComputeContext("local")
+ mm <- rxLinMod(formula, data = in_df, transformFunc = NULL, transformVars = NULL)
+ # the model needs to be serialized (and is gzip-compressed here) before it can be added to the return list
+ mm <- memCompress(serialize(mm, connection = NULL),type="gzip")
+ return(list(mm = mm))
+ }
> # create an InputData object for the input data frame in_df, with its query already set
> indata <- InputData(name = "in_df",
+ query = paste0("select top 10000 ArrDelay,CRSDepTime,",
+ "DayOfWeek,CRSDepHour from cleanData"))
> # create an OutputParameter object for the model (mm, of type raw) inside the return list
> outModelParam <- OutputParameter("mm", "raw")
> trainSP2 <- StoredProcedure(train2, "spTrain_df_to_op", indata, outModelParam,
+ filePath = "C:\\Users\\user\\Documents")
> conStr <- "Driver={ODBC Driver 13 for SQL Server};Server=.;Database=RevoTestDB;Trusted_Connection=Yes;"
> # the connection string must be passed to registerStoredProcedure and executeStoredProcedure since we did not
> # provide one when creating trainSP2
> registerStoredProcedure(trainSP2, conStr)
[1] TRUE
> model <- executeStoredProcedure(trainSP2, indata, connectionString = conStr, verbose = TRUE)
exec spTrain_df_to_op @input_data_1_outer = ?, @mm_outer = ?
> rxReadObject(model$params[[1]])
Call:
rxLinMod(formula = formula, data = in_df, transformFunc = NULL,
transformVars = NULL)
Linear Regression Results for: ArrDelay ~ CRSDepTime + DayOfWeek + CRSDepHour:DayOfWeek
Data: in_df
Dependent variable(s): ArrDelay
Total independent variables: 16 (Including number dropped: 7)
Number of valid observations: 9713
Number of missing observations: 287
Coefficients:
ArrDelay
(Intercept) -12.2606079
CRSDepTime 0.8384946
DayOfWeek=Monday 9.9722108
DayOfWeek=Tuesday 10.7932985
DayOfWeek=Wednesday 0.2561862
DayOfWeek=Thursday Dropped
DayOfWeek=Friday Dropped
DayOfWeek=Saturday Dropped
DayOfWeek=Sunday Dropped
CRSDepHour for DayOfWeek=Monday -0.4449961
CRSDepHour for DayOfWeek=Tuesday -0.6194943
CRSDepHour for DayOfWeek=Wednesday 1.0483718
CRSDepHour for DayOfWeek=Thursday 0.5470396
CRSDepHour for DayOfWeek=Friday Dropped
CRSDepHour for DayOfWeek=Saturday Dropped
CRSDepHour for DayOfWeek=Sunday Dropped
The data comes into the function as a data frame; the model and the name assigned to the predicted variable each come in as parameters. The prediction is returned as a data frame. The p-value becomes an output parameter of the stored procedure. It does not get propagated back to R; however, it can be captured from SSMS.
> # in_df - input data frame to be scored
> # model_param - serialized model object
> # predVarNameInParam - name of the predicted variable
> score1 <- function(in_df, model_param, predVarNameInParam) {
+ factorLevels <- c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")
+ in_df[,"DayOfWeek"] <- factor(in_df[,"DayOfWeek"], levels=factorLevels)
+ mm <- rxReadObject(as.raw(model_param))
+ # Predict with the deserialized model; the returned list holds the prediction (result) and the model's f.pvalue (pvOutParam)
+ result <- rxPredict(modelObject = mm,
+ data = in_df,
+ outData = NULL,
+ predVarNames = predVarNameInParam,
+ extraVarsToWrite = c("ArrDelay"),
+ writeModelVars = TRUE,
+ overwrite = TRUE)
+ return(list(result = result, pvOutParam = mm$f.pvalue))
+ }
>
> # create an InputData object for the input data frame in_df
> indata <- InputData(name = "in_df", defaultQuery = "SELECT top 10 * from cleanData")
> # create InputParameter objects for model_param (default: the 'linmod.v1' row of rdata) and predVarNameInParam
> model <- InputParameter("model_param", "raw",
+ defaultQuery = paste("select top 1 value from rdata",
+ "where [key] = 'linmod.v1'"))
> predVarNameInParam <- InputParameter("predVarNameInParam", "character")
> # create an OutputData object for the data frame inside the return list
> outData <- OutputData("result")
> # create an OutputParameter object for the non-data-frame variable inside the return list
> pvOutParam <- OutputParameter("pvOutParam", "numeric")
> scoreSP1 <- StoredProcedure(score1, "spScore_df_param_df", indata, model, predVarNameInParam, outData, pvOutParam,
+ filePath = "C:\\Users\\user\\Documents")
> conStr <- "Driver={ODBC Driver 13 for SQL Server};Server=.;Database=RevoTestDB;Trusted_Connection=Yes;"
> # the connection string is necessary for registration and execution
> # since we did not pass it to StoredProcedure
> registerStoredProcedure(scoreSP1, conStr)
[1] TRUE
> model <- executeStoredProcedure(scoreSP1, predVarNameInParam = "ArrDelayEstimate", connectionString = conStr, verbose = TRUE)
exec spScore_df_param_df @predVarNameInParam_outer = ?, @pvOutParam_outer = ?
> model$data
column1 column2 column3 column4 column5
1 1.8120845 6 9.666666 Monday 9
2 0.4219664 -2 6.416667 Monday 6
3 6.2803173 -2 20.833334 Monday 20
4 5.6771952 -15 19.583334 Monday 19
5 2.1357094 -7 10.583334 Monday 10
6 6.0449431 16 21.083332 Monday 21
7 2.5990813 0 11.666666 Monday 11
8 5.7213215 -9 20.166668 Monday 20
9 0.8853391 2 7.500000 Monday 7
10 1.2788375 19 8.500000 Monday 8
> model$params[[1]]
[1] 4.032124e-154