Merge pull request #136 from kgoldfeld/fix-issue-135

kgoldfeld · web-flow · commit e7efa4c0439d · 2022-01-15T12:47:56.000-05:00
Fixing rounding bug in .gencat
diff --git a/R/generate_dist.R b/R/generate_dist.R
@@ -44,7 +44,7 @@
       formula = args$formula,
       variance = args$variance,
       link = args$link,
-      dfSim = copy(dfSim),
+      dtSim = copy(dfSim),
       envir = envir
     ),
     exponential = .genexp(
@@ -270,7 +270,7 @@
 # @param envir Environment the data definitions are evaluated in.
 #  Defaults to [base::parent.frame].
 # @return A data.frame column with the updated simulated data
-.gencat <- function(n, formula, variance, link, dfSim, envir) {
+.gencat <- function(n, formula, variance, link, dtSim, envir) {
   formulas <- .splitFormula(formula)
 
   if (length(formulas) < 2) {
@@ -281,7 +281,7 @@
   }
 
   parsedProbs <-
-    .evalWith(formulas, .parseDotVars(formulas, envir), dfSim, n)
+    .evalWith(formulas, .parseDotVars(formulas, envir), dtSim, n)
 
   if (link == "logit") {
     parsedProbs <- exp(parsedProbs)
@@ -291,6 +291,7 @@
   }
 
   parsedProbs <- cbind(parsedProbs, 1 - rowSums(parsedProbs))
+  parsedProbs <- round(parsedProbs, 12) # to avoid extremely small p's
 
   c <- .Call(`_simstudy_matMultinom`, parsedProbs, PACKAGE = "simstudy")
 
diff --git a/man/genOrdCat.Rd b/man/genOrdCat.Rd
diff --git a/vignettes/simstudy.Rmd b/vignettes/simstudy.Rmd
@@ -217,7 +217,7 @@ A *binomial* distribution is a discrete data distribution that represents the co
 
 #### categorical
 
-A *categorical* distribution is a discrete data distribution taking on values from $1$ to $K$, with each value representing a specific category, and there are $K$ categories. The categories may or may not be ordered. For a categorical variable with $k$ categories, the `formula` is a string of probabilities that sum to 1, each separated by a semi-colon: $(p_1 ; p_2 ; ... ; p_k)$. $p_1$ is the probability of the random variable falling in category $1$, $p_2$ is the probability of category $2$, etc. The probabilities can be specified as functions of other variables previously defined. The helper function `genCatFormula` is an easy way to create different probability strings. The `link` options are *identity* or *logit*. The `variance` field is optional an allows to provide categories other than the default `1...n` in the same format as `formula`: "a;b;c". Numeric variance Strings (e.g.  "50;100;200") will be converted to numeric when possible. 
+A *categorical* distribution is a discrete data distribution taking on values from $1$ to $K$, with each value representing a specific category, and there are $K$ categories. The categories may or may not be ordered. For a categorical variable with $k$ categories, the `formula` is a string of probabilities that sum to 1, each separated by a semi-colon: $(p_1 ; p_2 ; ... ; p_k)$. $p_1$ is the probability of the random variable falling in category $1$, $p_2$ is the probability of category $2$, etc. The probabilities can be specified as functions of other variables previously defined. The helper function `genCatFormula` is an easy way to create different probability strings. The `link` options are *identity* or *logit*. The `variance` field is optional and allows you to provide categories other than the default `1...n` in the same format as `formula`: "a;b;c". Numeric variance strings (e.g. "50;100;200") will be converted to numeric when possible. All probabilities are rounded to 12 decimal places to prevent errors caused by extremely small floating-point values.
 
 #### exponential