makeX.Rd
Converts a data frame to a data matrix suitable for input to glmnet.
Factors are converted to dummy matrices via "one-hot" encoding. Options deal
with missing values and sparsity.
makeX(train, test = NULL, na.impute = FALSE, sparse = FALSE, ...)
Required argument. A dataframe consisting of vectors, matrices and factors
Optional argument. A dataframe matching 'train' for use as testing data
Logical, default FALSE. If TRUE, missing
values for any column in the resultant 'x' matrix are replaced by the means
of the nonmissing values derived from 'train'.
Logical, default FALSE. If TRUE, then the
returned matrix (or matrices) are converted to class "CsparseMatrix".
Useful if some factors have a large number of levels, resulting in very big
matrices that are mostly zero.
additional arguments, currently unused
If only 'train' was provided, the function returns a matrix 'x'. If missing values were imputed, this matrix has an attribute containing its column means (before imputation). If 'test' was provided as well, a list with two components is returned: 'x' and 'xtest'.
The main function is to convert factors to dummy matrices via "one-hot"
encoding. Having the 'train' and 'test' data present is useful if some
factor levels are missing in either. Since a factor with k levels leads to a
submatrix with only a fraction 1/k of its entries nonzero (the remaining
(k-1)/k are zero), with large k the sparse=TRUE option
can be helpful; a large matrix will be returned, but stored in sparse matrix
format. Finally, the function can deal with missing data. The current
version has the option to replace missing observations with the mean from
the training data. For dummy submatrices, these are the mean proportions at
each level.
glmnet
set.seed(101)
### Single data frame
X = matrix(rnorm(20), 10, 2)
X3 = sample(letters[1:3], 10, replace = TRUE)
X4 = sample(LETTERS[1:3], 10, replace = TRUE)
df = data.frame(X, X3, X4)
makeX(df)
#> X1 X2 X3a X3b X3c X4A X4B X4C
#> 1 -0.3260365 0.5264481 0 1 0 0 0 1
#> 2 0.5524619 -0.7948444 0 0 1 0 1 0
#> 3 -0.6749438 1.4277555 1 0 0 0 1 0
#> 4 0.2143595 -1.4668197 1 0 0 1 0 0
#> 5 0.3107692 -0.2366834 1 0 0 0 1 0
#> 6 1.1739663 -0.1933380 0 1 0 1 0 0
#> 7 0.6187899 -0.8497547 1 0 0 1 0 0
#> 8 -0.1127343 0.0584655 0 1 0 1 0 0
#> 9 0.9170283 -0.8176704 0 1 0 0 0 1
#> 10 -0.2232594 -2.0503078 0 0 1 0 0 1
makeX(df, sparse = TRUE)
#> 10 x 8 sparse Matrix of class "dgCMatrix"
#> X1 X2 X3a X3b X3c X4A X4B X4C
#> 1 -0.3260365 0.5264481 . 1 . . . 1
#> 2 0.5524619 -0.7948444 . . 1 . 1 .
#> 3 -0.6749438 1.4277555 1 . . . 1 .
#> 4 0.2143595 -1.4668197 1 . . 1 . .
#> 5 0.3107692 -0.2366834 1 . . . 1 .
#> 6 1.1739663 -0.1933380 . 1 . 1 . .
#> 7 0.6187899 -0.8497547 1 . . 1 . .
#> 8 -0.1127343 0.0584655 . 1 . 1 . .
#> 9 0.9170283 -0.8176704 . 1 . . . 1
#> 10 -0.2232594 -2.0503078 . . 1 . . 1
### Single data frame with missing values
Xn = X
Xn[3, 1] = NA
Xn[5, 2] = NA
X3n = X3
X3n[6] = NA
X4n = X4
X4n[9] = NA
dfn = data.frame(Xn, X3n, X4n)
makeX(dfn)
#> X1 X2 X3na X3nb X3nc X4nA X4nB X4nC
#> 1 -0.3260365 0.5264481 0 1 0 0 0 1
#> 2 0.5524619 -0.7948444 0 0 1 0 1 0
#> 3 NA 1.4277555 1 0 0 0 1 0
#> 4 0.2143595 -1.4668197 1 0 0 1 0 0
#> 5 0.3107692 NA 1 0 0 0 1 0
#> 6 1.1739663 -0.1933380 NA NA NA 1 0 0
#> 7 0.6187899 -0.8497547 1 0 0 1 0 0
#> 8 -0.1127343 0.0584655 0 1 0 1 0 0
#> 9 0.9170283 -0.8176704 0 1 0 NA NA NA
#> 10 -0.2232594 -2.0503078 0 0 1 0 0 1
makeX(dfn, sparse = TRUE)
#> 10 x 8 sparse Matrix of class "dgCMatrix"
#> X1 X2 X3na X3nb X3nc X4nA X4nB X4nC
#> 1 -0.3260365 0.5264481 . 1 . . . 1
#> 2 0.5524619 -0.7948444 . . 1 . 1 .
#> 3 NA 1.4277555 1 . . . 1 .
#> 4 0.2143595 -1.4668197 1 . . 1 . .
#> 5 0.3107692 NA 1 . . . 1 .
#> 6 1.1739663 -0.1933380 NA NA NA 1 . .
#> 7 0.6187899 -0.8497547 1 . . 1 . .
#> 8 -0.1127343 0.0584655 . 1 . 1 . .
#> 9 0.9170283 -0.8176704 . 1 . NA NA NA
#> 10 -0.2232594 -2.0503078 . . 1 . . 1
makeX(dfn, na.impute = TRUE)
#> X1 X2 X3na X3nb X3nc X4nA X4nB
#> 1 -0.3260365 0.5264481 0.0000000 1.0000000 0.0000000 0.0000000 0.0000000
#> 2 0.5524619 -0.7948444 0.0000000 0.0000000 1.0000000 0.0000000 1.0000000
#> 3 0.3472605 1.4277555 1.0000000 0.0000000 0.0000000 0.0000000 1.0000000
#> 4 0.2143595 -1.4668197 1.0000000 0.0000000 0.0000000 1.0000000 0.0000000
#> 5 0.3107692 -0.4622295 1.0000000 0.0000000 0.0000000 0.0000000 1.0000000
#> 6 1.1739663 -0.1933380 0.4444444 0.3333333 0.2222222 1.0000000 0.0000000
#> 7 0.6187899 -0.8497547 1.0000000 0.0000000 0.0000000 1.0000000 0.0000000
#> 8 -0.1127343 0.0584655 0.0000000 1.0000000 0.0000000 1.0000000 0.0000000
#> 9 0.9170283 -0.8176704 0.0000000 1.0000000 0.0000000 0.4444444 0.3333333
#> 10 -0.2232594 -2.0503078 0.0000000 0.0000000 1.0000000 0.0000000 0.0000000
#> X4nC
#> 1 1.0000000
#> 2 0.0000000
#> 3 0.0000000
#> 4 0.0000000
#> 5 0.0000000
#> 6 0.0000000
#> 7 0.0000000
#> 8 0.0000000
#> 9 0.2222222
#> 10 1.0000000
#> attr(,"means")
#> X1 X2 X3na X3nb X3nc X4nA X4nB
#> 0.3472605 -0.4622295 0.4444444 0.3333333 0.2222222 0.4444444 0.3333333
#> X4nC
#> 0.2222222
makeX(dfn, na.impute = TRUE, sparse = TRUE)
#> 10 x 8 sparse Matrix of class "dgCMatrix"
#> X1 X2 X3na X3nb X3nc X4nA X4nB
#> 1 -0.3260365 0.5264481 . 1.0000000 . . .
#> 2 0.5524619 -0.7948444 . . 1.0000000 . 1.0000000
#> 3 0.3472605 1.4277555 1.0000000 . . . 1.0000000
#> 4 0.2143595 -1.4668197 1.0000000 . . 1.0000000 .
#> 5 0.3107692 -0.4622295 1.0000000 . . . 1.0000000
#> 6 1.1739663 -0.1933380 0.4444444 0.3333333 0.2222222 1.0000000 .
#> 7 0.6187899 -0.8497547 1.0000000 . . 1.0000000 .
#> 8 -0.1127343 0.0584655 . 1.0000000 . 1.0000000 .
#> 9 0.9170283 -0.8176704 . 1.0000000 . 0.4444444 0.3333333
#> 10 -0.2232594 -2.0503078 . . 1.0000000 . .
#> X4nC
#> 1 1.0000000
#> 2 .
#> 3 .
#> 4 .
#> 5 .
#> 6 .
#> 7 .
#> 8 .
#> 9 0.2222222
#> 10 1.0000000
### Test data as well
X = matrix(rnorm(10), 5, 2)
X3 = sample(letters[1:3], 5, replace = TRUE)
X4 = sample(LETTERS[1:3], 5, replace = TRUE)
dft = data.frame(X, X3, X4)
makeX(df, dft)
#> $x
#> X1 X2 X3a X3b X3c X4A X4B X4C
#> 1 -0.3260365 0.5264481 0 1 0 0 0 1
#> 2 0.5524619 -0.7948444 0 0 1 0 1 0
#> 3 -0.6749438 1.4277555 1 0 0 0 1 0
#> 4 0.2143595 -1.4668197 1 0 0 1 0 0
#> 5 0.3107692 -0.2366834 1 0 0 0 1 0
#> 6 1.1739663 -0.1933380 0 1 0 1 0 0
#> 7 0.6187899 -0.8497547 1 0 0 1 0 0
#> 8 -0.1127343 0.0584655 0 1 0 1 0 0
#> 9 0.9170283 -0.8176704 0 1 0 0 0 1
#> 10 -0.2232594 -2.0503078 0 0 1 0 0 1
#>
#> $xtest
#> X1 X2 X3a X3b X3c X4A X4B X4C
#> 11 -0.5098443 -0.7556130 0 1 0 0 1 0
#> 12 1.5661805 1.7384118 0 0 1 0 1 0
#> 13 1.5273728 0.7580952 0 1 0 0 1 0
#> 14 1.0059925 2.1152294 1 0 0 1 0 0
#> 15 -0.5829222 1.6704604 0 1 0 0 1 0
#>
makeX(df, dft, sparse = TRUE)
#> $x
#> 10 x 8 sparse Matrix of class "dgCMatrix"
#> X1 X2 X3a X3b X3c X4A X4B X4C
#> 1 -0.3260365 0.5264481 . 1 . . . 1
#> 2 0.5524619 -0.7948444 . . 1 . 1 .
#> 3 -0.6749438 1.4277555 1 . . . 1 .
#> 4 0.2143595 -1.4668197 1 . . 1 . .
#> 5 0.3107692 -0.2366834 1 . . . 1 .
#> 6 1.1739663 -0.1933380 . 1 . 1 . .
#> 7 0.6187899 -0.8497547 1 . . 1 . .
#> 8 -0.1127343 0.0584655 . 1 . 1 . .
#> 9 0.9170283 -0.8176704 . 1 . . . 1
#> 10 -0.2232594 -2.0503078 . . 1 . . 1
#>
#> $xtest
#> 5 x 8 sparse Matrix of class "dgCMatrix"
#> X1 X2 X3a X3b X3c X4A X4B X4C
#> 11 -0.5098443 -0.7556130 . 1 . . 1 .
#> 12 1.5661805 1.7384118 . . 1 . 1 .
#> 13 1.5273728 0.7580952 . 1 . . 1 .
#> 14 1.0059925 2.1152294 1 . . 1 . .
#> 15 -0.5829222 1.6704604 . 1 . . 1 .
#>
### Missing data in test as well
Xn = X
Xn[3, 1] = NA
Xn[5, 2] = NA
X3n = X3
X3n[1] = NA
X4n = X4
X4n[2] = NA
dftn = data.frame(Xn, X3n, X4n)
makeX(dfn, dftn)
#> $x
#> X1 X2 X3na X3nb X3nc X4nA X4nB X4nC
#> 1 -0.3260365 0.5264481 0 1 0 0 0 1
#> 2 0.5524619 -0.7948444 0 0 1 0 1 0
#> 3 NA 1.4277555 1 0 0 0 1 0
#> 4 0.2143595 -1.4668197 1 0 0 1 0 0
#> 5 0.3107692 NA 1 0 0 0 1 0
#> 6 1.1739663 -0.1933380 NA NA NA 1 0 0
#> 7 0.6187899 -0.8497547 1 0 0 1 0 0
#> 8 -0.1127343 0.0584655 0 1 0 1 0 0
#> 9 0.9170283 -0.8176704 0 1 0 NA NA NA
#> 10 -0.2232594 -2.0503078 0 0 1 0 0 1
#>
#> $xtest
#> X1 X2 X3na X3nb X3nc X4nA X4nB X4nC
#> 11 -0.5098443 -0.7556130 NA NA NA 0 1 0
#> 12 1.5661805 1.7384118 0 0 1 NA NA NA
#> 13 NA 0.7580952 0 1 0 0 1 0
#> 14 1.0059925 2.1152294 1 0 0 1 0 0
#> 15 -0.5829222 NA 0 1 0 0 1 0
#>
makeX(dfn, dftn, sparse = TRUE)
#> $x
#> 10 x 8 sparse Matrix of class "dgCMatrix"
#> X1 X2 X3na X3nb X3nc X4nA X4nB X4nC
#> 1 -0.3260365 0.5264481 . 1 . . . 1
#> 2 0.5524619 -0.7948444 . . 1 . 1 .
#> 3 NA 1.4277555 1 . . . 1 .
#> 4 0.2143595 -1.4668197 1 . . 1 . .
#> 5 0.3107692 NA 1 . . . 1 .
#> 6 1.1739663 -0.1933380 NA NA NA 1 . .
#> 7 0.6187899 -0.8497547 1 . . 1 . .
#> 8 -0.1127343 0.0584655 . 1 . 1 . .
#> 9 0.9170283 -0.8176704 . 1 . NA NA NA
#> 10 -0.2232594 -2.0503078 . . 1 . . 1
#>
#> $xtest
#> 5 x 8 sparse Matrix of class "dgCMatrix"
#> X1 X2 X3na X3nb X3nc X4nA X4nB X4nC
#> 11 -0.5098443 -0.7556130 NA NA NA . 1 .
#> 12 1.5661805 1.7384118 . . 1 NA NA NA
#> 13 NA 0.7580952 . 1 . . 1 .
#> 14 1.0059925 2.1152294 1 . . 1 . .
#> 15 -0.5829222 NA . 1 . . 1 .
#>
makeX(dfn, dftn, na.impute = TRUE)
#> $x
#> X1 X2 X3na X3nb X3nc X4nA X4nB
#> 1 -0.3260365 0.5264481 0.0000000 1.0000000 0.0000000 0.0000000 0.0000000
#> 2 0.5524619 -0.7948444 0.0000000 0.0000000 1.0000000 0.0000000 1.0000000
#> 3 0.3472605 1.4277555 1.0000000 0.0000000 0.0000000 0.0000000 1.0000000
#> 4 0.2143595 -1.4668197 1.0000000 0.0000000 0.0000000 1.0000000 0.0000000
#> 5 0.3107692 -0.4622295 1.0000000 0.0000000 0.0000000 0.0000000 1.0000000
#> 6 1.1739663 -0.1933380 0.4444444 0.3333333 0.2222222 1.0000000 0.0000000
#> 7 0.6187899 -0.8497547 1.0000000 0.0000000 0.0000000 1.0000000 0.0000000
#> 8 -0.1127343 0.0584655 0.0000000 1.0000000 0.0000000 1.0000000 0.0000000
#> 9 0.9170283 -0.8176704 0.0000000 1.0000000 0.0000000 0.4444444 0.3333333
#> 10 -0.2232594 -2.0503078 0.0000000 0.0000000 1.0000000 0.0000000 0.0000000
#> X4nC
#> 1 1.0000000
#> 2 0.0000000
#> 3 0.0000000
#> 4 0.0000000
#> 5 0.0000000
#> 6 0.0000000
#> 7 0.0000000
#> 8 0.0000000
#> 9 0.2222222
#> 10 1.0000000
#> attr(,"means")
#> X1 X2 X3na X3nb X3nc X4nA X4nB
#> 0.3472605 -0.4622295 0.4444444 0.3333333 0.2222222 0.4444444 0.3333333
#> X4nC
#> 0.2222222
#>
#> $xtest
#> X1 X2 X3na X3nb X3nc X4nA X4nB
#> 11 -0.5098443 -0.7556130 0.4444444 0.3333333 0.2222222 0.0000000 1.0000000
#> 12 1.5661805 1.7384118 0.0000000 0.0000000 1.0000000 0.4444444 0.3333333
#> 13 0.3472605 0.7580952 0.0000000 1.0000000 0.0000000 0.0000000 1.0000000
#> 14 1.0059925 2.1152294 1.0000000 0.0000000 0.0000000 1.0000000 0.0000000
#> 15 -0.5829222 -0.4622295 0.0000000 1.0000000 0.0000000 0.0000000 1.0000000
#> X4nC
#> 11 0.0000000
#> 12 0.2222222
#> 13 0.0000000
#> 14 0.0000000
#> 15 0.0000000
#>
makeX(dfn, dftn, sparse = TRUE, na.impute = TRUE)
#> $x
#> 10 x 8 sparse Matrix of class "dgCMatrix"
#> X1 X2 X3na X3nb X3nc X4nA X4nB
#> 1 -0.3260365 0.5264481 . 1.0000000 . . .
#> 2 0.5524619 -0.7948444 . . 1.0000000 . 1.0000000
#> 3 0.3472605 1.4277555 1.0000000 . . . 1.0000000
#> 4 0.2143595 -1.4668197 1.0000000 . . 1.0000000 .
#> 5 0.3107692 -0.4622295 1.0000000 . . . 1.0000000
#> 6 1.1739663 -0.1933380 0.4444444 0.3333333 0.2222222 1.0000000 .
#> 7 0.6187899 -0.8497547 1.0000000 . . 1.0000000 .
#> 8 -0.1127343 0.0584655 . 1.0000000 . 1.0000000 .
#> 9 0.9170283 -0.8176704 . 1.0000000 . 0.4444444 0.3333333
#> 10 -0.2232594 -2.0503078 . . 1.0000000 . .
#> X4nC
#> 1 1.0000000
#> 2 .
#> 3 .
#> 4 .
#> 5 .
#> 6 .
#> 7 .
#> 8 .
#> 9 0.2222222
#> 10 1.0000000
#>
#> $xtest
#> 5 x 8 sparse Matrix of class "dgCMatrix"
#> X1 X2 X3na X3nb X3nc X4nA X4nB
#> 11 -0.5098443 -0.7556130 0.4444444 0.3333333 0.2222222 . 1.0000000
#> 12 1.5661805 1.7384118 . . 1.0000000 0.4444444 0.3333333
#> 13 0.3472605 0.7580952 . 1.0000000 . . 1.0000000
#> 14 1.0059925 2.1152294 1.0000000 . . 1.0000000 .
#> 15 -0.5829222 -0.4622295 . 1.0000000 . . 1.0000000
#> X4nC
#> 11 .
#> 12 0.2222222
#> 13 .
#> 14 .
#> 15 .
#>