#Introduction
The data used in the following examples comes from the heart disease dataset found at the UCI Machine Learning Repository.
#Load packages
require(tidyverse); require(cheese)
## Loading required package: tidyverse
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## Loading required package: cheese
#Look at the top ten rows
heart_disease
## # A tibble: 303 x 9
## Age Sex ChestPain BP Cholesterol BloodSugar MaximumHR
## <dbl> <fct> <fct> <dbl> <dbl> <lgl> <dbl>
## 1 63 Male Typical … 145 233 TRUE 150
## 2 67 Male Asymptom… 160 286 FALSE 108
## 3 67 Male Asymptom… 120 229 FALSE 129
## 4 37 Male Non-angi… 130 250 FALSE 187
## 5 41 Fema… Atypical… 130 204 FALSE 172
## 6 56 Male Atypical… 120 236 FALSE 178
## 7 62 Fema… Asymptom… 140 268 FALSE 160
## 8 57 Fema… Asymptom… 120 354 FALSE 163
## 9 63 Male Asymptom… 130 254 FALSE 147
## 10 53 Male Asymptom… 140 203 TRUE 155
## # ... with 293 more rows, and 2 more variables:
## # ExerciseInducedAngina <fct>, HeartDisease <fct>
#Creating a univariate table
The function univariate_table
allows flexible summarization and presentation of variables in a dataset. Arguments are available to customize the statistics that are computed, association metrics, stratification variables, variable labels, etc. The format
argument allows the user to render the table as “html”, “latex”, “markdown”, “pandoc”, or “none” (i.e., return a data.frame
). The following examples are rendered in “html” (default):
##Default
By default, the median (iqr), count (%), and the number of distinct values are displayed for numeric, categorical, and 'other' data types, respectively.
#Default table
heart_disease %>%
univariate_table
Variable |
Level |
Summary |
Age |
|
56 (13) |
Sex |
Female |
97 (32.01%) |
Male |
206 (67.99%) |
ChestPain |
Typical angina |
23 (7.59%) |
Atypical angina |
50 (16.5%) |
Non-anginal pain |
86 (28.38%) |
Asymptomatic |
144 (47.52%) |
BP |
|
130 (20) |
Cholesterol |
|
241 (64) |
BloodSugar |
|
2 |
MaximumHR |
|
153 (32.5) |
ExerciseInducedAngina |
No |
204 (67.33%) |
Yes |
99 (32.67%) |
HeartDisease |
No |
164 (54.13%) |
Yes |
139 (45.87%) |
##Stratification variables
Any number of stratification variables can be added to the table, as row strata, column strata, or both, using an intuitive formula
interface in which the left-hand side specifies the row strata and the right-hand side specifies the column strata. The argument add_n
can be set to TRUE
if row and/or column strata are present, in which case the sample size of each stratum will be displayed.
#Single column strata
heart_disease %>%
univariate_table(
strata = ~HeartDisease
)
Variable |
Level |
No |
Yes |
Age |
|
52 (14.25) |
58 (10) |
Sex |
Female |
72 (43.9%) |
25 (17.99%) |
Male |
92 (56.1%) |
114 (82.01%) |
ChestPain |
Typical angina |
16 (9.76%) |
7 (5.04%) |
Atypical angina |
41 (25%) |
9 (6.47%) |
Non-anginal pain |
68 (41.46%) |
18 (12.95%) |
Asymptomatic |
39 (23.78%) |
105 (75.54%) |
BP |
|
130 (20) |
130 (25) |
Cholesterol |
|
234.5 (58.5) |
249 (66) |
BloodSugar |
|
2 |
2 |
MaximumHR |
|
161 (23.25) |
142 (31.5) |
ExerciseInducedAngina |
No |
141 (85.98%) |
63 (45.32%) |
Yes |
23 (14.02%) |
76 (54.68%) |
heart_disease %>%
univariate_table(
strata = ~HeartDisease,
add_n = TRUE
)
Variable |
Level |
No (N=164) |
Yes (N=139) |
Age |
|
52 (14.25) |
58 (10) |
Sex |
Female |
72 (43.9%) |
25 (17.99%) |
Male |
92 (56.1%) |
114 (82.01%) |
ChestPain |
Typical angina |
16 (9.76%) |
7 (5.04%) |
Atypical angina |
41 (25%) |
9 (6.47%) |
Non-anginal pain |
68 (41.46%) |
18 (12.95%) |
Asymptomatic |
39 (23.78%) |
105 (75.54%) |
BP |
|
130 (20) |
130 (25) |
Cholesterol |
|
234.5 (58.5) |
249 (66) |
BloodSugar |
|
2 |
2 |
MaximumHR |
|
161 (23.25) |
142 (31.5) |
ExerciseInducedAngina |
No |
141 (85.98%) |
63 (45.32%) |
Yes |
23 (14.02%) |
76 (54.68%) |
#Multiple column strata
heart_disease %>%
univariate_table(
strata = ~Sex + HeartDisease
)
Variable |
Level |
Female/No |
Male/No |
Female/Yes |
Male/Yes |
Age |
|
54 (17.25) |
52 (13) |
60 (5) |
57.5 (10) |
ChestPain |
Typical angina |
4 (5.56%) |
12 (13.04%) |
0 (0%) |
7 (6.14%) |
Atypical angina |
16 (22.22%) |
25 (27.17%) |
2 (8%) |
7 (6.14%) |
Non-anginal pain |
34 (47.22%) |
34 (36.96%) |
1 (4%) |
17 (14.91%) |
Asymptomatic |
18 (25%) |
21 (22.83%) |
22 (88%) |
83 (72.81%) |
BP |
|
130 (20.5) |
130 (20) |
140 (28) |
130 (20) |
Cholesterol |
|
249 (78.75) |
229.5 (44.25) |
268 (71) |
247.5 (70) |
BloodSugar |
|
2 |
2 |
2 |
2 |
MaximumHR |
|
159 (20.5) |
163 (25.75) |
146 (24) |
141 (31) |
ExerciseInducedAngina |
No |
64 (88.89%) |
77 (83.7%) |
11 (44%) |
52 (45.61%) |
Yes |
8 (11.11%) |
15 (16.3%) |
14 (56%) |
62 (54.39%) |
heart_disease %>%
univariate_table(
strata = ~Sex + HeartDisease,
add_n = TRUE,
strata_sep = "|"
)
Variable |
Level |
Female|No (N=72) |
Male|No (N=92) |
Female|Yes (N=25) |
Male|Yes (N=114) |
Age |
|
54 (17.25) |
52 (13) |
60 (5) |
57.5 (10) |
ChestPain |
Typical angina |
4 (5.56%) |
12 (13.04%) |
0 (0%) |
7 (6.14%) |
Atypical angina |
16 (22.22%) |
25 (27.17%) |
2 (8%) |
7 (6.14%) |
Non-anginal pain |
34 (47.22%) |
34 (36.96%) |
1 (4%) |
17 (14.91%) |
Asymptomatic |
18 (25%) |
21 (22.83%) |
22 (88%) |
83 (72.81%) |
BP |
|
130 (20.5) |
130 (20) |
140 (28) |
130 (20) |
Cholesterol |
|
249 (78.75) |
229.5 (44.25) |
268 (71) |
247.5 (70) |
BloodSugar |
|
2 |
2 |
2 |
2 |
MaximumHR |
|
159 (20.5) |
163 (25.75) |
146 (24) |
141 (31) |
ExerciseInducedAngina |
No |
64 (88.89%) |
77 (83.7%) |
11 (44%) |
52 (45.61%) |
Yes |
8 (11.11%) |
15 (16.3%) |
14 (56%) |
62 (54.39%) |
#Single row strata
heart_disease %>%
univariate_table(
strata = Sex~1
)
Variable |
Level |
Summary |
Female |
Age |
|
57 (13) |
ChestPain |
Typical angina |
4 (4.12%) |
Atypical angina |
18 (18.56%) |
Non-anginal pain |
35 (36.08%) |
Asymptomatic |
40 (41.24%) |
BP |
|
132 (20) |
Cholesterol |
|
254 (87) |
BloodSugar |
|
2 |
MaximumHR |
|
157 (23) |
ExerciseInducedAngina |
No |
75 (77.32%) |
Yes |
22 (22.68%) |
HeartDisease |
No |
72 (74.23%) |
Yes |
25 (25.77%) |
Male |
Age |
|
54.5 (12.75) |
ChestPain |
Typical angina |
19 (9.22%) |
Atypical angina |
32 (15.53%) |
Non-anginal pain |
51 (24.76%) |
Asymptomatic |
104 (50.49%) |
BP |
|
130 (20) |
Cholesterol |
|
235 (59.75) |
BloodSugar |
|
2 |
MaximumHR |
|
150.5 (35.5) |
ExerciseInducedAngina |
No |
129 (62.62%) |
Yes |
77 (37.38%) |
HeartDisease |
No |
92 (44.66%) |
Yes |
114 (55.34%) |
heart_disease %>%
univariate_table(
strata = Sex~1,
add_n = TRUE
)
Variable |
Level |
Summary |
Female (N=97) |
Age |
|
57 (13) |
ChestPain |
Typical angina |
4 (4.12%) |
Atypical angina |
18 (18.56%) |
Non-anginal pain |
35 (36.08%) |
Asymptomatic |
40 (41.24%) |
BP |
|
132 (20) |
Cholesterol |
|
254 (87) |
BloodSugar |
|
2 |
MaximumHR |
|
157 (23) |
ExerciseInducedAngina |
No |
75 (77.32%) |
Yes |
22 (22.68%) |
HeartDisease |
No |
72 (74.23%) |
Yes |
25 (25.77%) |
Male (N=206) |
Age |
|
54.5 (12.75) |
ChestPain |
Typical angina |
19 (9.22%) |
Atypical angina |
32 (15.53%) |
Non-anginal pain |
51 (24.76%) |
Asymptomatic |
104 (50.49%) |
BP |
|
130 (20) |
Cholesterol |
|
235 (59.75) |
BloodSugar |
|
2 |
MaximumHR |
|
150.5 (35.5) |
ExerciseInducedAngina |
No |
129 (62.62%) |
Yes |
77 (37.38%) |
HeartDisease |
No |
92 (44.66%) |
Yes |
114 (55.34%) |
#Column and row strata
heart_disease %>%
univariate_table(
strata = Sex~HeartDisease
)
Variable |
Level |
No |
Yes |
Female |
Age |
|
54 (17.25) |
60 (5) |
ChestPain |
Typical angina |
4 (5.56%) |
0 (0%) |
Atypical angina |
16 (22.22%) |
2 (8%) |
Non-anginal pain |
34 (47.22%) |
1 (4%) |
Asymptomatic |
18 (25%) |
22 (88%) |
BP |
|
130 (20.5) |
140 (28) |
Cholesterol |
|
249 (78.75) |
268 (71) |
BloodSugar |
|
2 |
2 |
MaximumHR |
|
159 (20.5) |
146 (24) |
ExerciseInducedAngina |
No |
64 (88.89%) |
11 (44%) |
Yes |
8 (11.11%) |
14 (56%) |
Male |
Age |
|
52 (13) |
57.5 (10) |
ChestPain |
Typical angina |
12 (13.04%) |
7 (6.14%) |
Atypical angina |
25 (27.17%) |
7 (6.14%) |
Non-anginal pain |
34 (36.96%) |
17 (14.91%) |
Asymptomatic |
21 (22.83%) |
83 (72.81%) |
BP |
|
130 (20) |
130 (20) |
Cholesterol |
|
229.5 (44.25) |
247.5 (70) |
BloodSugar |
|
2 |
2 |
MaximumHR |
|
163 (25.75) |
141 (31) |
ExerciseInducedAngina |
No |
77 (83.7%) |
52 (45.61%) |
Yes |
15 (16.3%) |
62 (54.39%) |
heart_disease %>%
univariate_table(
strata = Sex~HeartDisease,
add_n = TRUE
)
Variable |
Level |
No (N=164) |
Yes (N=139) |
Female (N=97) |
Age |
|
54 (17.25) |
60 (5) |
ChestPain |
Typical angina |
4 (5.56%) |
0 (0%) |
Atypical angina |
16 (22.22%) |
2 (8%) |
Non-anginal pain |
34 (47.22%) |
1 (4%) |
Asymptomatic |
18 (25%) |
22 (88%) |
BP |
|
130 (20.5) |
140 (28) |
Cholesterol |
|
249 (78.75) |
268 (71) |
BloodSugar |
|
2 |
2 |
MaximumHR |
|
159 (20.5) |
146 (24) |
ExerciseInducedAngina |
No |
64 (88.89%) |
11 (44%) |
Yes |
8 (11.11%) |
14 (56%) |
Male (N=206) |
Age |
|
52 (13) |
57.5 (10) |
ChestPain |
Typical angina |
12 (13.04%) |
7 (6.14%) |
Atypical angina |
25 (27.17%) |
7 (6.14%) |
Non-anginal pain |
34 (36.96%) |
17 (14.91%) |
Asymptomatic |
21 (22.83%) |
83 (72.81%) |
BP |
|
130 (20) |
130 (20) |
Cholesterol |
|
229.5 (44.25) |
247.5 (70) |
BloodSugar |
|
2 |
2 |
MaximumHR |
|
163 (25.75) |
141 (31) |
ExerciseInducedAngina |
No |
77 (83.7%) |
52 (45.61%) |
Yes |
15 (16.3%) |
62 (54.39%) |
##Association statistics
A typical practice is to add association metrics to a summary table, such as p-values, intended to evaluate the strength of relationship between a stratification column and the variables. The argument associations
takes a list
of functions, evaluates them for each variable/strata combination, and places the result in the table. Metrics are only computed if column strata are present. If there are also row strata present, the metrics will be computed within each group.
#Define the association function to add to the table
#  y: the strata (grouping) variable
#  x: the variable being compared across the groups of y
#Returns the p-value from wilcox.test when x is numeric,
#and from fisher.test on the x-by-y table otherwise
pvalues <- function(y, x) {

  #Non-numeric data: exact test on the cross-tabulation
  if(!is(x, "numeric")) {
    return(fisher.test(table(x, y))$p.value)
  }

  #Numeric data: rank-based comparison of x across the groups of y
  wilcox.test(x ~ y)$p.value

}
#Supply function to table
heart_disease %>%
univariate_table(
strata = ~HeartDisease,
add_n = TRUE,
associations = pvalues
)
Variable |
Level |
No (N=164) |
Yes (N=139) |
1 |
Age |
|
52 (14.25) |
58 (10) |
3.91727270669452e-05 |
Sex |
Female |
72 (43.9%) |
25 (17.99%) |
1.25894390015198e-06 |
Male |
92 (56.1%) |
114 (82.01%) |
|
ChestPain |
Typical angina |
16 (9.76%) |
7 (5.04%) |
1.08625481176028e-18 |
Atypical angina |
41 (25%) |
9 (6.47%) |
|
Non-anginal pain |
68 (41.46%) |
18 (12.95%) |
|
Asymptomatic |
39 (23.78%) |
105 (75.54%) |
|
BP |
|
130 (20) |
130 (25) |
0.0259721071731445 |
Cholesterol |
|
234.5 (58.5) |
249 (66) |
0.0353592013733774 |
BloodSugar |
|
2 |
2 |
0.746257200983864 |
MaximumHR |
|
161 (23.25) |
142 (31.5) |
1.86101461129144e-13 |
ExerciseInducedAngina |
No |
141 (85.98%) |
63 (45.32%) |
4.02917658499919e-14 |
Yes |
23 (14.02%) |
76 (54.68%) |
|
#Make a named list to name the column
metrics <- list(`P-value` = pvalues)
heart_disease %>%
univariate_table(
strata = ~HeartDisease,
add_n = TRUE,
associations = metrics
)
Variable |
Level |
No (N=164) |
Yes (N=139) |
P-value |
Age |
|
52 (14.25) |
58 (10) |
3.91727270669452e-05 |
Sex |
Female |
72 (43.9%) |
25 (17.99%) |
1.25894390015198e-06 |
Male |
92 (56.1%) |
114 (82.01%) |
|
ChestPain |
Typical angina |
16 (9.76%) |
7 (5.04%) |
1.08625481176028e-18 |
Atypical angina |
41 (25%) |
9 (6.47%) |
|
Non-anginal pain |
68 (41.46%) |
18 (12.95%) |
|
Asymptomatic |
39 (23.78%) |
105 (75.54%) |
|
BP |
|
130 (20) |
130 (25) |
0.0259721071731445 |
Cholesterol |
|
234.5 (58.5) |
249 (66) |
0.0353592013733774 |
BloodSugar |
|
2 |
2 |
0.746257200983864 |
MaximumHR |
|
161 (23.25) |
142 (31.5) |
1.86101461129144e-13 |
ExerciseInducedAngina |
No |
141 (85.98%) |
63 (45.32%) |
4.02917658499919e-14 |
Yes |
23 (14.02%) |
76 (54.68%) |
|
#Extend the list with a second metric: the AIC of a logistic regression
#of the strata variable (y) on each other variable (x)
metrics$AIC <-
  function(y, x) {
    logistic_fit <- glm(factor(y) ~ x, family = "binomial")
    AIC(logistic_fit)
  }
heart_disease %>%
univariate_table(
strata = ~HeartDisease,
add_n = TRUE,
associations = metrics
)
Variable |
Level |
No (N=164) |
Yes (N=139) |
P-value |
AIC |
Age |
|
52 (14.25) |
58 (10) |
3.91727270669452e-05 |
406.53555135163 |
Sex |
Female |
72 (43.9%) |
25 (17.99%) |
1.25894390015198e-06 |
397.932854578273 |
Male |
92 (56.1%) |
114 (82.01%) |
|
|
ChestPain |
Typical angina |
16 (9.76%) |
7 (5.04%) |
1.08625481176028e-18 |
339.86455352939 |
Atypical angina |
41 (25%) |
9 (6.47%) |
|
|
Non-anginal pain |
68 (41.46%) |
18 (12.95%) |
|
|
Asymptomatic |
39 (23.78%) |
105 (75.54%) |
|
|
BP |
|
130 (20) |
130 (25) |
0.0259721071731445 |
415.027967902517 |
Cholesterol |
|
234.5 (58.5) |
249 (66) |
0.0353592013733774 |
419.776573596341 |
BloodSugar |
|
2 |
2 |
0.746257200983864 |
421.789178189058 |
MaximumHR |
|
161 (23.25) |
142 (31.5) |
1.86101461129144e-13 |
364.902105031567 |
ExerciseInducedAngina |
No |
141 (85.98%) |
63 (45.32%) |
4.02917658499919e-14 |
363.537221546849 |
Yes |
23 (14.02%) |
76 (54.68%) |
|
|
#Compute metrics across heart disease status within sex
heart_disease %>%
univariate_table(
strata = Sex~HeartDisease,
add_n = TRUE,
associations = metrics
)
Variable |
Level |
No (N=164) |
Yes (N=139) |
P-value |
AIC |
Female (N=97) |
Age |
|
54 (17.25) |
60 (5) |
0.0398442516740459 |
110.131195630795 |
ChestPain |
Typical angina |
4 (5.56%) |
0 (0%) |
3.06061547795029e-07 |
84.6909093658332 |
Atypical angina |
16 (22.22%) |
2 (8%) |
|
|
Non-anginal pain |
34 (47.22%) |
1 (4%) |
|
|
Asymptomatic |
18 (25%) |
22 (88%) |
|
|
BP |
|
130 (20.5) |
140 (28) |
0.000395706006314433 |
98.4612833810574 |
Cholesterol |
|
249 (78.75) |
268 (71) |
0.103293588251319 |
113.103340772899 |
BloodSugar |
|
2 |
2 |
0.0715461243557009 |
110.96313878098 |
MaximumHR |
|
159 (20.5) |
146 (24) |
0.012407587301334 |
109.340618580244 |
ExerciseInducedAngina |
No |
64 (88.89%) |
11 (44%) |
1.69588923622621e-05 |
95.3736844105057 |
Yes |
8 (11.11%) |
14 (56%) |
|
|
Male (N=206) |
Age |
|
52 (13) |
57.5 (10) |
2.35520048183124e-05 |
270.025672435557 |
ChestPain |
Typical angina |
12 (13.04%) |
7 (6.14%) |
4.84907141302052e-12 |
236.189012884494 |
Atypical angina |
25 (27.17%) |
7 (6.14%) |
|
|
Non-anginal pain |
34 (36.96%) |
17 (14.91%) |
|
|
Asymptomatic |
21 (22.83%) |
83 (72.81%) |
|
|
BP |
|
130 (20) |
130 (20) |
0.431611379617407 |
286.264205286785 |
Cholesterol |
|
229.5 (44.25) |
247.5 (70) |
0.0124440883774021 |
281.249277382165 |
BloodSugar |
|
2 |
2 |
0.446533653014522 |
286.479455093556 |
MaximumHR |
|
163 (25.75) |
141 (31) |
1.61928796484918e-12 |
231.906181238564 |
ExerciseInducedAngina |
No |
77 (83.7%) |
52 (45.61%) |
1.20005825650629e-08 |
253.896083814185 |
Yes |
15 (16.3%) |
62 (54.39%) |
|
|
##Custom string templates for summary statistics
It may be of interest to present summary statistics other than the default, and in a different format. The numeric_summary
, categorical_summary
, and other_summary
arguments take character vectors which allow any number of summaries to be added to the table in any format. Values are simply requested verbatim by name (e.g., “median (iqr) | mean (sd)” will provide a column in the table in which the median, IQR, mean, and standard deviation are filled in with their computed values).
#Add summary columns for numeric data
heart_disease %>%
univariate_table(
numeric_summary = c(Median = "median", Mean = "mean")
)
Variable |
Level |
Median |
Mean |
Summary |
Age |
|
56 |
54.44 |
|
Sex |
Female |
|
|
97 (32.01%) |
Male |
|
|
206 (67.99%) |
ChestPain |
Typical angina |
|
|
23 (7.59%) |
Atypical angina |
|
|
50 (16.5%) |
Non-anginal pain |
|
|
86 (28.38%) |
Asymptomatic |
|
|
144 (47.52%) |
BP |
|
130 |
131.69 |
|
Cholesterol |
|
241 |
246.69 |
|
BloodSugar |
|
|
|
2 |
MaximumHR |
|
153 |
149.61 |
|
ExerciseInducedAngina |
No |
|
|
204 (67.33%) |
Yes |
|
|
99 (32.67%) |
HeartDisease |
No |
|
|
164 (54.13%) |
Yes |
|
|
139 (45.87%) |
#Add a stratification variable
heart_disease %>%
univariate_table(
numeric_summary = c(Median = "median", Mean = "mean"),
strata = ~HeartDisease
)
|
No |
Yes |
Variable |
Level |
Median |
Mean |
Summary |
Median |
Mean |
Summary |
Age |
|
52 |
52.59 |
|
58 |
56.63 |
|
Sex |
Female |
|
|
72 (43.9%) |
|
|
25 (17.99%) |
Male |
|
|
92 (56.1%) |
|
|
114 (82.01%) |
ChestPain |
Typical angina |
|
|
16 (9.76%) |
|
|
7 (5.04%) |
Atypical angina |
|
|
41 (25%) |
|
|
9 (6.47%) |
Non-anginal pain |
|
|
68 (41.46%) |
|
|
18 (12.95%) |
Asymptomatic |
|
|
39 (23.78%) |
|
|
105 (75.54%) |
BP |
|
130 |
129.25 |
|
130 |
134.57 |
|
Cholesterol |
|
234.5 |
242.64 |
|
249 |
251.47 |
|
BloodSugar |
|
|
|
2 |
|
|
2 |
MaximumHR |
|
161 |
158.38 |
|
142 |
139.26 |
|
ExerciseInducedAngina |
No |
|
|
141 (85.98%) |
|
|
63 (45.32%) |
Yes |
|
|
23 (14.02%) |
|
|
76 (54.68%) |
The following strings are available by default:
Numeric types
- “median”
- “mean”
- “min”
- “max”
- “iqr”
- “sd”
Categorical types
- “count”
- “proportion”
- “percent”
All variables
- “length”
- “missing”
- “available”
- “unique”
These can be placed in a string template in any format, and will be replaced with the actual value when the function is called.
##Miscellaneous features
Numerous other features are available to further customize the table:
- Provide clean labels to variable names and factor levels
- Enter custom summary statistics to compute on different types of data
- Choose which data types become evaluated with functions for numeric and categorical data
- Evaluate the string template as an
R
expression once populated with the result
- And more…
See ?univariate_table
for details.
#Core functions
The functions used to implement different pieces of the specialized functions above were intentionally written to be generalizable and useful in other contexts.
-
divide
: Stratify a dataset into a list by 1 or more variables
heart_disease %>%
divide(
by = "Sex"
)
## $Female
## # A tibble: 97 x 8
## Age ChestPain BP Cholesterol BloodSugar MaximumHR ExerciseInduced…
## <dbl> <fct> <dbl> <dbl> <lgl> <dbl> <fct>
## 1 41 Atypical… 130 204 FALSE 172 No
## 2 62 Asymptom… 140 268 FALSE 160 No
## 3 57 Asymptom… 120 354 FALSE 163 Yes
## 4 56 Atypical… 140 294 FALSE 153 No
## 5 48 Non-angi… 130 275 FALSE 139 No
## 6 58 Typical … 150 283 TRUE 162 No
## 7 50 Non-angi… 120 219 FALSE 158 No
## 8 58 Non-angi… 120 340 FALSE 172 No
## 9 66 Typical … 150 226 FALSE 114 No
## 10 69 Typical … 140 239 FALSE 151 No
## # ... with 87 more rows, and 1 more variable: HeartDisease <fct>
##
## $Male
## # A tibble: 206 x 8
## Age ChestPain BP Cholesterol BloodSugar MaximumHR ExerciseInduced…
## <dbl> <fct> <dbl> <dbl> <lgl> <dbl> <fct>
## 1 63 Typical … 145 233 TRUE 150 No
## 2 67 Asymptom… 160 286 FALSE 108 Yes
## 3 67 Asymptom… 120 229 FALSE 129 Yes
## 4 37 Non-angi… 130 250 FALSE 187 No
## 5 56 Atypical… 120 236 FALSE 178 No
## 6 63 Asymptom… 130 254 FALSE 147 No
## 7 53 Asymptom… 140 203 TRUE 155 Yes
## 8 57 Asymptom… 140 192 FALSE 148 No
## 9 56 Non-angi… 130 256 TRUE 142 Yes
## 10 44 Atypical… 120 263 FALSE 173 No
## # ... with 196 more rows, and 1 more variable: HeartDisease <fct>
-
stratiply
: Apply a function to a data frame by one or more variables and easily gather results
heart_disease %>%
stratiply(
strata = c("Sex", "HeartDisease"),
f = function(x)
x %>%
select_if(is.numeric) %>%
map(mean, na.rm = TRUE),
bind = TRUE,
separate = TRUE
)
## # A tibble: 4 x 6
## Sex HeartDisease Age BP Cholesterol MaximumHR
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Female No 54.6 129. 257. 154.
## 2 Male No 51.0 130. 232. 162.
## 3 Female Yes 59.1 147. 276. 143.
## 4 Male Yes 56.1 132. 246. 138.
-
stretch
: Spread any number of values across the columns by any number of keys
#Build a frame of Age summaries, one row per observed
#combination of Sex, HeartDisease, and BloodSugar
temp_summary <-
  heart_disease %>%
  group_by(Sex, HeartDisease, BloodSugar) %>%
  summarise(
    Mean = mean(Age, na.rm = TRUE),
    SD = sd(Age, na.rm = TRUE),
    Median = median(Age, na.rm = TRUE)
  ) %>%
  #Remove the grouping that remains after summarising
  ungroup()
#Span summaries for each combination of Sex and BloodSugar
temp_summary %>%
stretch(
keys = c("Sex", "BloodSugar"),
keep = "HeartDisease"
)
## # A tibble: 2 x 13
## HeartDisease Mean_Female_FAL… SD_Female_FALSE Median_Female_F…
## <fct> <dbl> <dbl> <dbl>
## 1 No 54.0 10.4 53.5
## 2 Yes 59.7 3.61 61
## # ... with 9 more variables: Mean_Male_FALSE <dbl>, SD_Male_FALSE <dbl>,
## # Median_Male_FALSE <dbl>, Mean_Female_TRUE <dbl>, SD_Female_TRUE <dbl>,
## # Median_Female_TRUE <dbl>, Mean_Male_TRUE <dbl>, SD_Male_TRUE <dbl>,
## # Median_Male_TRUE <dbl>
#Clean HTML table with keys spanned over columns
result <-
temp_summary %>%
stretch(
keys = c("Sex", "BloodSugar"),
keep = "HeartDisease",
extract_keys_as_header = TRUE,
keep_keys_in_header = FALSE
)
result$.result %>%
knitr::kable(format = "html") %>%
kableExtra::kable_styling() %>%
kableExtra::add_header_above(
kableExtra::auto_index(result$.header)
)
|
Female_FALSE |
Male_FALSE |
Female_TRUE |
Male_TRUE |
HeartDisease |
Mean |
SD |
Median |
Mean |
SD |
Median |
Mean |
SD |
Median |
Mean |
SD |
Median |
No |
54.03030 |
10.408532 |
53.5 |
50.21333 |
8.819961 |
51.0 |
60.33333 |
6.653320 |
59 |
54.70588 |
6.761700 |
53 |
Yes |
59.68421 |
3.606362 |
61.0 |
55.75510 |
8.560351 |
57.5 |
57.16667 |
7.808115 |
58 |
58.12500 |
7.116881 |
58 |
-
dish
: Evaluate a two-argument function on subsets of a data frame by evaluating each combination of columns or subsets
heart_disease %>%
dish(
f =
function(y, x) {
mod <- lm(y ~ x)
tibble(
Parameter = names(mod$coef),
Estimate = mod$coef
)
},
left = c("Age", "BP"),
bind = TRUE
)
## # A tibble: 32 x 4
## .left .right Parameter Estimate
## <chr> <chr> <chr> <dbl>
## 1 Age Sex (Intercept) 55.7
## 2 Age Sex xMale -1.89
## 3 Age ChestPain (Intercept) 55.9
## 4 Age ChestPain xAtypical angina -4.51
## 5 Age ChestPain xNon-anginal pain -2.17
## 6 Age ChestPain xAsymptomatic -0.147
## 7 Age Cholesterol (Intercept) 45.4
## 8 Age Cholesterol x 0.0365
## 9 Age BloodSugar (Intercept) 54.0
## 10 Age BloodSugar xTRUE 3.01
## # ... with 22 more rows
-
absorb
: Fill the values from a key-value pair into a custom string template containing keys
absorb(
key = c("mean", "sd", "var"),
value = c("10", "2", "4"),
text =
c("MEAN: mean, SD: sd",
"VAR: var = sd^2",
MEAN = "mean"
)
)
## MEAN
## "MEAN: 10, SD: 2" "VAR: 4 = 2^2" "10"
-
typly
: Apply a function (or list
of functions) to columns of a data.frame
or elements of a list
that conform to one or more types
heart_disease %>%
#Compute means and medians on numeric data
typly(
c("numeric", "logical"),
list(
mean = mean,
median = median
),
keep = TRUE,
na.rm = TRUE
) %>%
#Compute table
typly(
"factor",
table,
keep = TRUE
)
## $Age
## $Age$mean
## [1] 54.43894
##
## $Age$median
## [1] 56
##
##
## $Sex
## .x
## Female Male
## 97 206
##
## $ChestPain
## .x
## Typical angina Atypical angina Non-anginal pain Asymptomatic
## 23 50 86 144
##
## $BP
## $BP$mean
## [1] 131.6898
##
## $BP$median
## [1] 130
##
##
## $Cholesterol
## $Cholesterol$mean
## [1] 246.6931
##
## $Cholesterol$median
## [1] 241
##
##
## $BloodSugar
## $BloodSugar$mean
## [1] 0.1485149
##
## $BloodSugar$median
## [1] FALSE
##
##
## $MaximumHR
## $MaximumHR$mean
## [1] 149.6073
##
## $MaximumHR$median
## [1] 153
##
##
## $ExerciseInducedAngina
## .x
## No Yes
## 204 99
##
## $HeartDisease
## .x
## No Yes
## 164 139
-
descriptives
: Gather descriptive statistics into a data.frame
heart_disease %>%
descriptives(
f_numeric =
list(
cv = function(x, na.rm) sd(x, na.rm = na.rm)/mean(x, na.rm = na.rm)
)
)
## # A tibble: 102 x 7
## .variable .key .value .label .level .order .combo
## <chr> <chr> <dbl> <chr> <chr> <int> <chr>
## 1 Age length 303 <NA> <NA> NA 303
## 2 Age missing 0 <NA> <NA> NA 0
## 3 Age available 303 <NA> <NA> NA 303
## 4 Age class NA numeric <NA> NA numeric
## 5 Age unique 41 <NA> <NA> NA 41
## 6 Age evaluated NA continuous <NA> NA continuous
## 7 Age mean 54.4 <NA> <NA> NA 54.44
## 8 Age sd 9.04 <NA> <NA> NA 9.04
## 9 Age min 29 <NA> <NA> NA 29
## 10 Age median 56 <NA> <NA> NA 56
## # ... with 92 more rows
-
univariate_associations
: Apply association functions to any number of “response” variables with any number of “predictors”
#Build a named list of association functions; each takes the
#response (x) and one predictor (y)
f <-
  list(

    #Univariate p-value, formatted as text for display
    `P-value` =
      function(x, y) {
        #Factor/character predictors go through fisher.test (with
        #simulate.p.value = TRUE); everything else through kruskal.test
        p_value <-
          if(type_match(y, c("factor", "character"))) {
            fisher.test(factor(x), factor(y), simulate.p.value = TRUE)$p.value
          } else {
            kruskal.test(y, factor(x))$p.value
          }
        #Report very small values as a threshold, otherwise two decimals
        if_else(
          p_value < 0.001, "<0.001", as.character(round(p_value, 2))
        )
      },

    #AIC of the null logistic model minus AIC of the one-predictor model
    `AIC Difference` =
      function(x, y) {
        null_model <- glm(factor(x) ~ 1, family = "binomial")
        predictor_model <- glm(factor(x) ~ y, family = "binomial")
        null_model$aic - predictor_model$aic
      }

  )
#1) Apply functions to Sex/HeartDisease by all other variables
heart_disease %>%
univariate_associations(
f = f,
responses = c("Sex", "HeartDisease")
)
## # A tibble: 14 x 4
## .left .variable `P-value` `AIC Difference`
## <fct> <chr> <chr> <dbl>
## 1 Sex Age 0.08 0.910
## 2 Sex ChestPain 0.08 0.958
## 3 Sex BP 0.29 -0.752
## 4 Sex Cholesterol 0.01 10.0
## 5 Sex BloodSugar 0.41 -1.29
## 6 Sex MaximumHR 0.43 -1.28
## 7 Sex ExerciseInducedAngina 0.01 4.72
## 8 HeartDisease Age <0.001 13.4
## 9 HeartDisease ChestPain <0.001 80.1
## 10 HeartDisease BP 0.03 4.95
## 11 HeartDisease Cholesterol 0.04 0.206
## 12 HeartDisease BloodSugar 0.66 -1.81
## 13 HeartDisease MaximumHR <0.001 55.1
## 14 HeartDisease ExerciseInducedAngina <0.001 56.4