欢迎大家关注全网生信学习者系列:
本教程用于对研究的数据汇总和数据探索,其中用于可视化的图形有柱状图、森林图等。
森林图是一种常用于展示多个研究结果的统计图表。它显示了各个研究的效应量(effect size)或优势比(odds ratio, OR)以及它们的95%置信区间(confidence interval, CI)。通过森林图,研究者可以快速地比较和评估不同变量对结局变量的影响。
#| warning: false
#| message: false
library(tidyverse)
library(readr)
library(magrittr)
library(qwraps2)
library(ggpubr)
library(ggsci)
library(CLME)
大家通过以下链接下载数据:
df_merge <- read_csv("./data/GMSB-data/df_merge.csv", show_col_types = FALSE)
df_seq <- read_csv("./data/GMSB-data/per-sample-fastq-counts.csv", show_col_types = FALSE)
df_v1 <- df_merge %>%
dplyr::filter(visit == "v1")
df_seq <- df_seq %>%
dplyr::filter(`Sample ID` %in% df_v1$sampleid)
评估二代测序的测序通量,在后续校正测序通量对检验结果的影响
tol_reads <- sum(df_seq$`Sequence count`)
mean_reads <- mean(df_seq$`Sequence count`)
med_reads <- median(df_seq$`Sequence count`)
bin_length <- 4000
b <- seq(0, 140000, bin_length)
p <- df_seq %>%
ggplot(aes(x = `Sequence count`)) +
geom_histogram(aes(y = ..count..), breaks = b, color = "black", fill = "lightblue1") +
geom_density(aes(y = ..density..* (nrow(df_seq) * bin_length)), color = "brown3") +
scale_x_continuous(breaks = seq(20000, 140000, 40000)) +
labs(x = "Read Number", y = "Number of Samples", title = "Reads per sample",
subtitle = paste0("Min = ", min(df_seq$`Sequence count`), " (",
df_seq$`Sample ID`[which.min(df_seq$`Sequence count`)], ") \n",
"Max = ", max(df_seq$`Sequence count`), " (",
df_seq$`Sample ID`[which.max(df_seq$`Sequence count`)], ")")) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
p
结果:测序通量
r nrow(df_seq)
r tol_reads
r mean_reads
r med_reads
不同分组的结局变量status
(seroconverter (SC) rates 和 seronegative control (NC))与其他变量的相关性分析
Primary group: 按照频率分组1
Secondary group: 按照频率分组2
recept_anal <- df_v1$recept_anal
recept_anal <- recept_anal[!is.na(recept_anal)]
print(paste0("mean = ", round(mean(recept_anal)),
", median = ", median(recept_anal),
", min = ", min(recept_anal),
", max = ", max(recept_anal)))
p_expose <- df_v1 %>%
ggplot(aes(x = recept_anal)) +
geom_histogram(aes(y = ..count..), bins = 100, color = "black", fill = "lightblue1") +
labs(x = "Number of Receptive Anal Intercourse", y = "Frequency") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
p_expose
结果:数据分布呈现偏态分布,大部分人集中在0-10之间。
options(qwraps2_markup = "markdown") # markdown
summary_template <-
list("Primary Group" =
list("G1" = ~ n_perc0(group1 == "g1", na_rm = T),
"G2" = ~ n_perc0(group1 == "g2", na_rm = T),
"G3" = ~ n_perc0(group1 == "g3", na_rm = T),
"G4" = ~ n_perc0(group1 == "g4", na_rm = T),
"Missing" = ~ n_perc0(group1 == "missing", na_rm = T)),
"Secondary Group" =
list("G1" = ~ n_perc0(group2 == "g1", na_rm = T),
"G2" = ~ n_perc0(group2 == "g2", na_rm = T),
"G3" = ~ n_perc0(group2 == "g3", na_rm = T),
"G4" = ~ n_perc0(group2 == "g4", na_rm = T),
"G5" = ~ n_perc0(group2 == "g5", na_rm = T),
"Missing" = ~ n_perc0(group2 == "missing", na_rm = T)),
"Status" =
list("Ctrl" = ~ n_perc0(status == "nc", na_rm = T),
"Case" = ~ n_perc0(status == "sc", na_rm = T),
"Missing" = ~ n_perc0(status == "missing", na_rm = T)),
"Time to Develop AIDS" =
list("< 5 years" = ~ n_perc0(time2aids == "< 5 yrs", na_rm = T),
"5 - 10 years" = ~ n_perc0(time2aids == "5 - 10 yrs", na_rm = T),
"> 10 years" = ~ n_perc0(time2aids == "> 10 yrs", na_rm = T),
"Non-progressor" = ~ n_perc0(time2aids == "never", na_rm = T),
"Missing" = ~ n_perc0(time2aids == "missing", na_rm = T)),
"Age" =
list("min" = ~ round(min(age, na.rm = T), 2),
"max" = ~ round(max(age, na.rm = T), 2),
"mean (sd)" = ~ qwraps2::mean_sd(age, na_rm = T, show_n = "never")),
"Race" =
list("White" = ~ n_perc0(race == "white", na_rm = T),
"Black" = ~ n_perc0(race == "black", na_rm = T),
"Others" = ~ n_perc0(race == "others", na_rm = T),
"Missing" = ~ n_perc0(race == "missing", na_rm = T)),
"Education" =
list("Postgrad" = ~ n_perc0(educa == "postgrad", na_rm = T),
"Undergrad" = ~ n_perc0(educa == "undergrad", na_rm = T),
"No degree" = ~ n_perc0(educa == "no degree", na_rm = T),
"Missing" = ~ n_perc0(educa == "missing", na_rm = T)),
"Smoking status" =
list("Never" = ~ n_perc0(smoke == "never", na_rm = T),
"Former" = ~ n_perc0(smoke == "former", na_rm = T),
"Current" = ~ n_perc0(smoke == "current", na_rm = T),
"Missing" = ~ n_perc0(smoke == "missing", na_rm = T)),
"Drinking status" =
list("None" = ~ n_perc0(drink == "none", na_rm = T),
"Low/Moderate" = ~ n_perc0(drink == "low/moderate", na_rm = T),
"Moderate/Heavy" = ~ n_perc0(drink == "moderate/heavy", na_rm = T),
"Binge" = ~ n_perc0(drink == "binge", na_rm = T),
"Missing" = ~ n_perc0(drink == "missing", na_rm = T)),
"Antibiotics Usage" =
list("No" = ~ n_perc0(abx_use == "no", na_rm = T),
"Yes" = ~ n_perc0(abx_use == "yes", na_rm = T),
"Missing" = ~ n_perc0(abx_use == "missing", na_rm = T)),
"STD" =
list("No" = ~ n_perc0(std == "no", na_rm = T),
"Yes" = ~ n_perc0(std == "yes", na_rm = T),
"Missing" = ~ n_perc0(std == "missing", na_rm = T)),
"Substance use" =
list("No" = ~ n_perc0(druguse == "no", na_rm = T),
"Yes" = ~ n_perc0(druguse == "yes", na_rm = T),
"Missing" = ~ n_perc0(druguse == "missing", na_rm = T)),
"HBV" =
list("Negative" = ~ n_perc0(hbv == "negative", na_rm = T),
"Resolved" = ~ n_perc0(hbv == "resolved", na_rm = T),
"Positive" = ~ n_perc0(hbv == "positive", na_rm = T),
"Missing" = ~ n_perc0(hbv == "missing", na_rm = T)),
"HCV" =
list("Negative" = ~ n_perc0(hcv == "negative", na_rm = T),
"Positive" = ~ n_perc0(hcv == "positive", na_rm = T),
"Missing" = ~ n_perc0(hcv == "missing", na_rm = T))
)
tab <- df_v1 %>%
qwraps2::summary_table(summary_template)
# print(tab, rtitle = "Demographic information at the baseline visit")
knitr::kable(tab, caption = "Demographic information at the baseline visit")
# Tidy up the baseline data
df_v1 <- df_v1 %>%
mutate(
drink = case_when(
drink %in% c("none", "low/moderate") ~ "low",
drink %in% c("moderate/heavy", "binge") ~ "heavy",
TRUE ~ "missing"
)
)
options(qwraps2_markup = "markdown")
summary_template <-
list("Primary Group" =
list("G1" = ~ n_perc0(group1 == "g1", na_rm = T),
"G2" = ~ n_perc0(group1 == "g2", na_rm = T),
"G3" = ~ n_perc0(group1 == "g3", na_rm = T),
"G4" = ~ n_perc0(group1 == "g4", na_rm = T),
"Missing" = ~ n_perc0(group1 == "missing", na_rm = T)),
"Secondary Group" =
list("G1" = ~ n_perc0(group2 == "g1", na_rm = T),
"G2" = ~ n_perc0(group2 == "g2", na_rm = T),
"G3" = ~ n_perc0(group2 == "g3", na_rm = T),
"G4" = ~ n_perc0(group2 == "g4", na_rm = T),
"G5" = ~ n_perc0(group2 == "g5", na_rm = T),
"Missing" = ~ n_perc0(group2 == "missing", na_rm = T)),
"Status" =
list("Ctrl" = ~ n_perc0(status == "nc", na_rm = T),
"Case" = ~ n_perc0(status == "sc", na_rm = T),
"Missing" = ~ n_perc0(status == "missing", na_rm = T)),
"Time to Develop AIDS" =
list("< 5 years" = ~ n_perc0(time2aids == "< 5 yrs", na_rm = T),
"5 - 10 years" = ~ n_perc0(time2aids == "5 - 10 yrs", na_rm = T),
"> 10 years" = ~ n_perc0(time2aids == "> 10 yrs", na_rm = T),
"Non-progressor" = ~ n_perc0(time2aids == "never", na_rm = T),
"Missing" = ~ n_perc0(time2aids == "missing", na_rm = T)),
"Age" =
list("min" = ~ round(min(age, na.rm = T), 2),
"max" = ~ round(max(age, na.rm = T), 2),
"mean (sd)" = ~ qwraps2::mean_sd(age, na_rm = T, show_n = "never")),
"Race" =
list("White" = ~ n_perc0(race == "white", na_rm = T),
"Black" = ~ n_perc0(race == "black", na_rm = T),
"Others" = ~ n_perc0(race == "others", na_rm = T),
"Missing" = ~ n_perc0(race == "missing", na_rm = T)),
"Education" =
list("Postgrad" = ~ n_perc0(educa == "postgrad", na_rm = T),
"Undergrad" = ~ n_perc0(educa == "undergrad", na_rm = T),
"No degree" = ~ n_perc0(educa == "no degree", na_rm = T),
"Missing" = ~ n_perc0(educa == "missing", na_rm = T)),
"Smoking status" =
list("Never" = ~ n_perc0(smoke == "never", na_rm = T),
"Former" = ~ n_perc0(smoke == "former", na_rm = T),
"Current" = ~ n_perc0(smoke == "current", na_rm = T),
"Missing" = ~ n_perc0(smoke == "missing", na_rm = T)),
"Drinking status" =
list("Low" = ~ n_perc0(drink == "low", na_rm = T),
"Heavy" = ~ n_perc0(drink == "heavy", na_rm = T),
"Missing" = ~ n_perc0(drink == "missing", na_rm = T)),
"Antibiotics Usage" =
list("No" = ~ n_perc0(abx_use == "no", na_rm = T),
"Yes" = ~ n_perc0(abx_use == "yes", na_rm = T),
"Missing" = ~ n_perc0(abx_use == "missing", na_rm = T)),
"STD" =
list("No" = ~ n_perc0(std == "no", na_rm = T),
"Yes" = ~ n_perc0(std == "yes", na_rm = T),
"Missing" = ~ n_perc0(std == "missing", na_rm = T)),
"Substance use" =
list("No" = ~ n_perc0(druguse == "no", na_rm = T),
"Yes" = ~ n_perc0(druguse == "yes", na_rm = T),
"Missing" = ~ n_perc0(druguse == "missing", na_rm = T)),
"HBV" =
list("Negative" = ~ n_perc0(hbv == "negative", na_rm = T),
"Resolved" = ~ n_perc0(hbv == "resolved", na_rm = T),
"Positive" = ~ n_perc0(hbv == "positive", na_rm = T),
"Missing" = ~ n_perc0(hbv == "missing", na_rm = T)),
"HCV" =
list("Negative" = ~ n_perc0(hcv == "negative", na_rm = T),
"Positive" = ~ n_perc0(hcv == "positive", na_rm = T),
"Missing" = ~ n_perc0(hcv == "missing", na_rm = T))
)
tab <- df_v1 %>%
summary_table(summary_template)
# print(tab, rtitle = "Demographic information at the baseline visit")
knitr::kable(tab, caption = "Demographic information at the baseline visit")
结局变量status
(seroconverter (SC) rates 和 seronegative control (NC))与其他变量的相关关系,通过glm
广义线性模型计算获得,并使用森林图展示结果。
df <- df_v1 %>%
filter(!is.na(age), educa != "missing", abx_use != "missing",
std != "missing", smoke != "missing", drink != "missing",
druguse != "missing", hbv != "missing",
status != "missing")
df$status <- factor(df$status, levels = c("nc", "sc"))
df$educa <- factor(df$educa, levels = c("no degree", "undergrad", "postgrad"))
df$abx_use <- factor(df$abx_use, levels = c("no", "yes"))
df$std <- factor(df$std, levels = c("no", "yes"))
df$smoke <- factor(df$smoke, levels = c("never", "former", "current"))
df$drink <- factor(df$drink, levels = c("low", "heavy"))
df$druguse <- factor(df$druguse, levels = c("no", "yes"))
df$hbv <- factor(df$hbv, levels = c("negative", "resolved", "positive"))
model <- glm(status ~ age + educa + abx_use + std + smoke + drink +
druguse + hbv, data = df, family = binomial)
tidy_model <- broom::tidy(model) %>%
filter(term != "(Intercept)") %>%
mutate(index = seq_len(length(coef(model)) - 1),
label = c("Age", "Educa (Undergrad)", "Educa (Postgrad)",
"Abx Use (Yes)", "STD (Yes)",
"Smoke (Former)", "Smoke (Current)",
"Drink (Heavy)", "Substance Use (Yes)", "HBV (Resolved)", "HBV (Positive)"),
label_factor = factor(label, levels = rev(sort(label))),
p_value = round(p.value, 2),
OR = exp(estimate),
LL = exp(estimate - 1.96*std.error),
UL = exp(estimate + 1.96*std.error)) %>%
rowwise() %>%
mutate(CI = paste(round(LL, 2), round(UL, 2), sep = ", ")) %>%
ungroup()
p_forest <- tidy_model %>%
ggplot(aes(y = label_factor, x = OR)) +
geom_vline(xintercept = 1, color = "red", linetype = "dashed", linewidth = 1, alpha = 0.5) +
geom_errorbarh(aes(xmin = LL, xmax = UL), height = 0.25) +
geom_point(shape = 18, size = 5) +
scale_x_log10(breaks = c(0.1, 0.2, 0.5, 1, 2, 5, 10),
labels = c("0.1", "0.2", "0.5", "1", "2", "5", "10")) +
scale_y_discrete(name = "") +
labs(x = "Odds Ratio (95% CI)", y = "") +
theme_bw() +
theme(panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
axis.text.y = element_text(size = 12, colour = "black"),
axis.text.x.bottom = element_text(size = 12, colour = "black"),
axis.title.x = element_text(size = 12, colour = "black")) +
ggtitle("")
table_base <- tidy_model %>%
ggplot(aes(y = label_factor)) +
labs(x = "", y = "") +
theme(plot.title = element_text(hjust = 0.5, size = 12),
axis.text.x = element_text(color = "white", hjust = -3, size = 25),
axis.line = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.y = element_blank(),
legend.position = "none",
panel.background = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.background = element_blank())
tab_or <- table_base +
geom_text(aes(y = label_factor, x = 1,
label = sprintf("%0.1f", round(OR, digits = 1))), size = 4) +
ggtitle("OR")
tab_ci <- table_base +
geom_text(aes(y = label_factor, x = 1, label = CI), size = 4) +
ggtitle("95% CI")
tab_p <- table_base +
geom_text(aes(y = label_factor, x = 1, label = p_value), size = 4) +
ggtitle("P")
p_out_var_prim <- ggarrange(
p_forest, tab_or, tab_ci, tab_p,
ncol = 4, nrow = 1,
widths = c(10, 1, 2, 1), align = "h")
p_out_var_prim
结果:森林图展示其他变量与结局变量status
的相关关系
使用Constrained Inference for Linear Mixed Effects Models(线性混合效应模型的约束推理)方法解析primary group
变量和status
结局变量的关系。
线性混合效应模型(Linear Mixed-Effects Models,简称LMMs)是一种用于分析具有多层次或嵌套结构数据的统计模型。这种模型可以处理数据中的固定效应(fixed effects)和随机效应(random effects),使得研究者能够考虑和控制数据中的非独立性(non-independence)。在传统的线性模型中,参数估计通常是通过最小化残差平方和来完成的,这被称为无约束估计(unconstrained estimation)。然而,在某些情况下,我们可能希望对模型参数施加一些约束条件,比如参数的正性、非负性或其他类型的线性约束。这就是所谓的约束推理。
与传统线性模型的不同。
df_fig <- df_v1 %>%
filter(group1 != "missing",
status != "missing") %>%
group_by(group1) %>%
summarise(total = n(),
nc_num = sum(status == "nc"),
sc_num = sum(status == "sc")) %>%
dplyr::mutate(perc_nc = round(nc_num/total * 100, 2),
perc_sc = round(sc_num/total * 100, 2)) %>%
dplyr::select(group1, perc_nc, perc_sc) %>%
pivot_longer(cols = perc_nc:perc_sc, names_to = "perc", values_to = "value")
cons <- list(order = "simple", decreasing = FALSE, node = 1)
df_test <- df_v1 %>%
filter(group1 != "missing",
status != "missing") %>%
transmute(group1 = group1,
status = ifelse(status == "nc", 0, 1))
df_test$group1 <- factor(df_test$group1, ordered = TRUE)
fit_clme <- clme(status ~ group1, data = df_test, constraints = cons, seed = 123)
fit_summ <- summary(fit_clme)
fit_summ
p_val <- fit_summ$p.value
p_lab <- ifelse(p_val == 0, "p < 0.001", paste0("p = ", p_val))
test_lab <- paste0("LRT = ", signif(fit_summ$ts.glb, 2))
lab <- paste0(test_lab, ", ", p_lab)
df_fig$group1 <- recode(df_fig$group1,
`g1` = "G1", `g2` = "G2",
`g3` = "G3", `g4` = "G4")
p_summ_prim <- df_fig %>%
ggbarplot(x = "group1", y = "value",
fill = "perc", color = "black", palette = "Paired",
xlab = "", ylab = "Percentage (%)",
label = TRUE, lab.col = "white", lab.pos = "in") +
scale_y_continuous(breaks = c(0, 20, 40, 60, 80, 100), limits = c(0, 110)) +
geom_text(aes(x = 3.5, y = 110, label = lab)) +
scale_fill_aaas(name = NULL, labels = c("Ctrl", "Case"))
p_summ_prim
Linear mixed model subject to order restrictions
Formula: status ~ group1 - 1
Order specified: increasing simple order
log-likelihood: -134.1
AIC: 278.1
BIC: 286.4
(log-likelihood, AIC, BIC computed under normality)
Global test:
Contrast Statistic p-value
Bootstrap LRT 0.002 0.0000
Individual Tests (Williams' type tests):
Contrast Estimate Statistic p-value
g2 - g1 0.251 3.249 0.0000
g3 - g2 0.178 2.476 0.0000
g4 - g3 0.266 3.145 0.0000
Variance components:
Std. Error
Residual 0.422039
Fixed effect coefficients (theta):
Estimate Std. Err 95% lower 95% upper
g1 0.04762 0.05317 -0.057 0.152
g2 0.29825 0.05590 0.189 0.408
g3 0.47674 0.04551 0.388 0.566
g4 0.74286 0.07134 0.603 0.883
Std. Errors and confidence limits based on unconstrained covariance matrix
Parameters are ordered according to the following factor levels:
g1, g2, g3, g4
结果:Constrained Inference for Linear Mixed Effects Models的结果(排序)
暴露变量Exposure (primary group)
与其他变量的相关关系,通过Ordered Logistic or Probit Regression
线性模型计算获得,并使用森林图展示结果。
Ordered Logistic Regression(有序逻辑回归)和Ordered Probit Regression(有序Probit回归)是两种用于分析有序分类数据的统计方法。有序分类数据是指数据的类别具有自然的顺序或等级,例如,调查问卷中的满意度评级(非常不满意、不满意、一般、满意、非常满意)。
df <- df_v1 %>%
filter(!is.na(age), educa != "missing", abx_use != "missing",
std != "missing", smoke != "missing", drink != "missing",
druguse != "missing", hbv != "missing",
group1 != "missing")
df$group1 <- factor(df$group1, levels = c("g1", "g2", "g3", "g4"))
df$educa <- factor(df$educa, levels = c("no degree", "undergrad", "postgrad"))
df$abx_use <- factor(df$abx_use, levels = c("no", "yes"))
df$std <- factor(df$std, levels = c("no", "yes"))
df$smoke <- factor(df$smoke, levels = c("never", "former", "current"))
df$drink <- factor(df$drink, levels = c("low", "heavy"))
df$druguse <- factor(df$druguse, levels = c("no", "yes"))
df$hbv <- factor(df$hbv, levels = c("negative", "resolved", "positive"))
model <- polr(group1 ~ age + educa + abx_use + std + smoke + drink +
druguse + hbv, data = df, Hess = TRUE, method = "logistic")
coefs <- summary(model)$coefficients
wald_stat <- coefs[, "Value"] / coefs[, "Std. Error"]
wald_pvalue <- 2 * pnorm(abs(wald_stat), lower.tail = FALSE)
wald_pvalue <- wald_pvalue[!names(wald_pvalue) %in% c("g1|g2", "g2|g3", "g3|g4")]
tidy_model <- broom::tidy(model) %>%
filter(!term %in% c("g1|g2", "g2|g3", "g3|g4")) %>%
mutate(label = c("Age", "Educa (Undergrad)", "Educa (Postgrad)",
"Abx Use (Yes)", "STD (Yes)",
"Smoke (Former)", "Smoke (Current)",
"Drink (Heavy)", "Substance Use (Yes)", "HBV (Resolved)", "HBV (Positive)"),
label_factor = factor(label, levels = rev(sort(label))),
p_value = round(wald_pvalue, 2),
OR = exp(estimate),
LL = exp(estimate - 1.96*std.error),
UL = exp(estimate + 1.96*std.error)) %>%
rowwise() %>%
mutate(CI = paste(round(LL, 2), round(UL, 2), sep = ", ")) %>%
ungroup()
p_forest <- tidy_model %>%
ggplot(aes(y = label_factor, x = OR)) +
geom_vline(xintercept = 1, color = "red", linetype = "dashed", cex = 1, alpha = 0.5) +
geom_errorbarh(aes(xmin = LL, xmax = UL), height = 0.25) +
geom_point(shape = 18, size = 5) +
scale_x_log10(breaks = c(0.1, 0.2, 0.5, 1, 2, 5, 10),
labels = c("0.1", "0.2", "0.5", "1", "2", "5", "10")) +
scale_y_discrete(name = "") +
labs(x = "Odds Ratio (95% CI)", y = "") +
theme_bw() +
theme(panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
axis.text.y = element_text(size = 12, colour = "black"),
axis.text.x.bottom = element_text(size = 12, colour = "black"),
axis.title.x = element_text(size = 12, colour = "black")) +
ggtitle("")
table_base <- tidy_model %>%
ggplot(aes(y = label_factor)) +
labs(x = "", y = "") +
theme(plot.title = element_text(hjust = 0.5, size = 12),
axis.text.x = element_text(color = "white", hjust = -3, size = 25),
axis.line = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.y = element_blank(),
legend.position = "none",
panel.background = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.background = element_blank())
tab_or <- table_base +
geom_text(aes(y = label_factor, x = 1,
label = sprintf("%0.1f", round(OR, digits = 1))), size = 4) +
ggtitle("OR")
tab_ci <- table_base +
geom_text(aes(y = label_factor, x = 1, label = CI), size = 4) +
ggtitle("95% CI")
tab_p <- table_base +
geom_text(aes(y = label_factor, x = 1, label = p_value), size = 4) +
ggtitle("P")
p_exp_var_prim <- ggarrange(
p_forest, tab_or, tab_ci, tab_p,
ncol = 4, nrow = 1,
widths = c(10, 1, 2, 1), align = "h")
p_exp_var_prim
使用Constrained Inference for Linear Mixed Effects Models(线性混合效应模型的约束推理)方法解析secondary group
变量和status
结局变量的关系。
df_fig <- df_v1 %>%
filter(group2 != "missing",
status != "missing") %>%
group_by(group2) %>%
summarise(total = n(),
nc_num = sum(status == "nc"),
sc_num = sum(status == "sc")) %>%
dplyr::mutate(perc_nc = round(nc_num/total * 100, 2),
perc_sc = round(sc_num/total * 100, 2)) %>%
dplyr::select(group2, perc_nc, perc_sc) %>%
pivot_longer(cols = perc_nc:perc_sc, names_to = "perc", values_to = "value")
cons <- list(order = "simple", decreasing = FALSE, node = 1)
df_test <- df_v1 %>%
filter(group2 != "missing",
status != "missing") %>%
transmute(group2 = group2,
status = ifelse(status == "nc", 0, 1))
df_test$group2 <- factor(df_test$group2, ordered = TRUE)
fit_clme <- clme(status ~ group2, data = df_test, constraints = cons, seed = 123)
fit_summ <- summary(fit_clme)
fit_summ
p_val <- fit_summ$p.value
p_lab <- ifelse(p_val == 0, "p < 0.001", paste0("p = ", p_val))
df_fig$group2 <- recode(df_fig$group2,
`g1` = "G1", `g2` = "G2",
`g3` = "G3", `g4` = "G4", `g5` = "G5")
p_summ_second <- df_fig %>%
ggbarplot(x = "group2", y = "value",
fill = "perc", color = "black", palette = "Paired",
xlab = "", ylab = "Percentage (%)",
label = TRUE, lab.col = "white", lab.pos = "in") +
scale_y_continuous(breaks = c(0, 20, 40, 60, 80, 100), limits = c(0, 110)) +
geom_text(aes(x = 3.5, y = 110, label = p_lab)) +
scale_fill_aaas(name = NULL, labels = c("Ctrl", "Case"))
p_summ_second
Linear mixed model subject to order restrictions
Formula: status ~ group2 - 1
Order specified: increasing simple order
log-likelihood: -136.2
AIC: 284.5
BIC: 294.4
(log-likelihood, AIC, BIC computed under normality)
Global test:
Contrast Statistic p-value
Bootstrap LRT 0.003 0.0000
Individual Tests (Williams' type tests):
Contrast Estimate Statistic p-value
g2 - g1 0.251 3.219 0.0000
g3 - g2 0.212 2.645 0.0000
g4 - g3 0.000 0.000 1.0000
g5 - g4 0.229 2.072 0.0050
Variance components:
Std. Error
Residual 0.4258823
Fixed effect coefficients (theta):
Estimate Std. Err 95% lower 95% upper
g1 0.04762 0.05366 -0.058 0.153
g2 0.29825 0.05641 0.188 0.409
g3 0.51020 0.05691 0.399 0.622
g4 0.51020 0.06572 0.381 0.639
g5 0.73913 0.08880 0.565 0.913
Std. Errors and confidence limits based on unconstrained covariance matrix
Parameters are ordered according to the following factor levels:
g1, g2, g3, g4, g5
df <- df_v1 %>%
filter(!is.na(age), educa != "missing", abx_use != "missing",
std != "missing", smoke != "missing", drink != "missing",
druguse != "missing", hbv != "missing",
group2 != "missing")
df$group2 <- factor(df$group2, levels = c("g1", "g2", "g3", "g4", "g5"))
df$educa <- factor(df$educa, levels = c("no degree", "undergrad", "postgrad"))
df$abx_use <- factor(df$abx_use, levels = c("no", "yes"))
df$std <- factor(df$std, levels = c("no", "yes"))
df$smoke <- factor(df$smoke, levels = c("never", "former", "current"))
df$drink <- factor(df$drink, levels = c("low", "heavy"))
df$druguse <- factor(df$druguse, levels = c("no", "yes"))
df$hbv <- factor(df$hbv, levels = c("negative", "resolved", "positive"))
model <- polr(group2 ~ age + educa + abx_use + std + smoke + drink +
druguse + hbv, data = df, Hess = TRUE, method = "logistic")
coefs <- summary(model)$coefficients
wald_stat <- coefs[, "Value"] / coefs[, "Std. Error"]
wald_pvalue <- 2 * pnorm(abs(wald_stat), lower.tail = FALSE)
wald_pvalue <- wald_pvalue[!names(wald_pvalue) %in% c("g1|g2", "g2|g3", "g3|g4", "g4|g5")]
tidy_model <- broom::tidy(model) %>%
filter(!term %in% c("g1|g2", "g2|g3", "g3|g4", "g4|g5")) %>%
mutate(label = c("Age", "Educa (Undergrad)", "Educa (Postgrad)",
"Abx Use (Yes)", "STD (Yes)",
"Smoke (Former)", "Smoke (Current)",
"Drink (Heavy)", "Substance Use (Yes)", "HBV (Resolved)", "HBV (Positive)"),
label_factor = factor(label, levels = rev(sort(label))),
p_value = round(wald_pvalue, 2),
OR = exp(estimate),
LL = exp(estimate - 1.96*std.error),
UL = exp(estimate + 1.96*std.error)) %>%
rowwise() %>%
mutate(CI = paste(round(LL, 2), round(UL, 2), sep = ", ")) %>%
ungroup()
p_forest <- tidy_model %>%
ggplot(aes(y = label_factor, x = OR)) +
geom_vline(xintercept = 1, color = "red", linetype = "dashed", cex = 1, alpha = 0.5) +
geom_errorbarh(aes(xmin = LL, xmax = UL), height = 0.25) +
geom_point(shape = 18, size = 5) +
scale_x_log10(breaks = c(0.1, 0.2, 0.5, 1, 2, 5, 10),
labels = c("0.1", "0.2", "0.5", "1", "2", "5", "10")) +
scale_y_discrete(name = "") +
labs(x = "Odds Ratio (95% CI)", y = "") +
theme_bw() +
theme(panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
axis.text.y = element_text(size = 12, colour = "black"),
axis.text.x.bottom = element_text(size = 12, colour = "black"),
axis.title.x = element_text(size = 12, colour = "black")) +
ggtitle("")
table_base <- tidy_model %>%
ggplot(aes(y = label_factor)) +
labs(x = "", y = "") +
theme(plot.title = element_text(hjust = 0.5, size = 12),
axis.text.x = element_text(color = "white", hjust = -3, size = 25),
axis.line = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.y = element_blank(),
legend.position = "none",
panel.background = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
plot.background = element_blank())
tab_or <- table_base +
geom_text(aes(y = label_factor, x = 1,
label = sprintf("%0.1f", round(OR, digits = 1))), size = 4) +
ggtitle("OR")
tab_ci <- table_base +
geom_text(aes(y = label_factor, x = 1, label = CI), size = 4) +
ggtitle("95% CI")
tab_p <- table_base +
geom_text(aes(y = label_factor, x = 1, label = p_value), size = 4) +
ggtitle("P")
p_exp_var_second <- ggarrange(
p_forest, tab_or, tab_ci, tab_p,
ncol = 4, nrow = 1,
widths = c(10, 1, 2, 1), align = "h")
p_exp_var_second
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。
原创声明:本文系作者授权腾讯云开发者社区发表,未经许可,不得转载。
如有侵权,请联系 cloudcommunity@tencent.com 删除。