我想在一张图上按分组添加线条,这些线条的取值是从另一个 data frame 中计算出来的。
我的数据如下:
# Example data: one FREQ value per (Sample_Run, MUT_ID) pair. Both factors keep
# their full level sets, so levels that never occur in these 15 rows are still
# present, and the original (non-consecutive) row names are preserved.
demo_df <- data.frame(
  Sample_Run = factor(
    paste0("Sample", c(3, 4, 2, 1, 5, 3, 1, 4, 5, 2, 3, 2, 1, 5, 4)),
    levels = paste0("Sample", 1:20)
  ),
  MUT_ID = factor(
    rep(
      c("AKT1 c.49G>A", "APC c.4666_4667insA", "APC c.4348C>T"),
      each = 5
    ),
    levels = c(
      "AKT1 c.49G>A",
      "APC c.4348C>T", "APC c.4666_4667insA", "ATM c.1058_1059delGT",
      "BRAF c.1799T>A", "CTNNB1 c.121A>G", "EGFR c.2236_2250del15",
      "EGFR c.2310_2311insGGT", "EGFR c.2369C>T", "EGFR c.2573T>G",
      "ERBB2 c.2324_2325ins12", "FGFR3 c.746C>G", "FLT3 c.2503G>T",
      "GNA11 c.626A>T", "GNAQ c.626A>C", "GNAS c.601C>T", "JAK2 c.1849G>T",
      "KIT c.2447A>T", "KRAS c.35G>A", "MPL c.1544G>T",
      "NPM1 c.863_864insTCTG",
      "NRAS c.182A>G", "PDGFRA c.1694_1695insA", "PDGFRA c.2525A>T",
      "PIK3CA c.1633G>A", "PIK3CA c.3140A>G", "PIK3CA c.3204_3205insA",
      "PTEN c.741_742insA", "PTEN c.800delA", "RET c.2753T>C",
      "SMAD4 c.1394_1395insT",
      "TP53 c.524G>A", "TP53 c.723delC", "TP53 c.743G>A", "TP53 c.818G>A"
    )
  ),
  FREQ = c(
    0.091, 0.077, 0.09, 0.096, 0.114,
    0.081, 0.071, 0.076, 0.084, 0.083,
    0.08, 0.082, 0.087, 0.085, 0.094
  ),
  row.names = c(
    1L, 4L, 5L, 7L, 8L,
    46L, 47L, 48L, 50L, 51L,
    91L, 93L, 94L, 96L, 97L
  )
)

# Per-mutation standard deviation of FREQ (one row per MUT_ID that occurs).
demo_sd <- setNames(
  aggregate(demo_df$FREQ, by = list(demo_df$MUT_ID), FUN = sd),
  c("MUT_ID", "sd")
)

# Per-mutation mean of FREQ, same layout as demo_sd.
demo_mean <- setNames(
  aggregate(demo_df$FREQ, by = list(demo_df$MUT_ID), FUN = mean),
  c("MUT_ID", "mean")
)

# Combine both summaries on their shared key column.
demo_mean_sd <- merge(demo_sd, demo_mean, by = "MUT_ID")

# Bounds of the mean +/- 2 SD interval for each mutation.
demo_mean_sd$sd_interval_upper <- with(demo_mean_sd, mean + 2 * sd)
demo_mean_sd$sd_interval_lower <- with(demo_mean_sd, mean - 2 * sd)
输出:
> demo_df
Sample_Run MUT_ID FREQ
1 Sample3 AKT1 c.49G>A 0.091
4 Sample4 AKT1 c.49G>A 0.077
5 Sample2 AKT1 c.49G>A 0.090
7 Sample1 AKT1 c.49G>A 0.096
8 Sample5 AKT1 c.49G>A 0.114
46 Sample3 APC c.4666_4667insA 0.081
47 Sample1 APC c.4666_4667insA 0.071
48 Sample4 APC c.4666_4667insA 0.076
50 Sample5 APC c.4666_4667insA 0.084
51 Sample2 APC c.4666_4667insA 0.083
91 Sample3 APC c.4348C>T 0.080
93 Sample2 APC c.4348C>T 0.082
94 Sample1 APC c.4348C>T 0.087
96 Sample5 APC c.4348C>T 0.085
97 Sample4 APC c.4348C>T 0.094
> demo_mean_sd
MUT_ID sd mean sd_interval_upper sd_interval_lower
1 AKT1 c.49G>A 0.013390295 0.0936 0.12038059 0.06681941
2 APC c.4348C>T 0.005412947 0.0856 0.09642589 0.07477411
3 APC c.4666_4667insA 0.005431390 0.0790 0.08986278 0.06813722
我可以画出这样的基本图:
library(ggplot2)

# Baseline figure: FREQ for every sample run, coloured by mutation, with one
# facet row per mutation. The y axis is anchored at 0; its upper limit is left
# to be computed from the data (NA).
ggplot(
  demo_df,
  aes(x = Sample_Run, y = FREQ, colour = MUT_ID, group = MUT_ID)
) +
  geom_point() +
  geom_line(alpha = 0.3) +
  facet_grid(MUT_ID ~ .) +
  scale_y_continuous(limits = c(0, NA))
看起来是这样的:
但是,我需要根据 demo_mean_sd 中的标准差和平均值在图上添加线条。它应该是这样的:
但是,由于缺少一个共同的x轴,我一直遇到一些问题,试图在图上画线。例如,我尝试了这样的方法:
# Attempt that fails (kept as-is to reproduce the error quoted below).
# The plot-level aes() maps x = Sample_Run, and layers inherit plot-level
# mappings by default. The ribbon layer is given demo_mean_sd, which has no
# Sample_Run column, so the inherited x mapping cannot be evaluated there.
ggplot(data = demo_df,
aes(y = FREQ, x = Sample_Run, color = MUT_ID, group = MUT_ID) ) +
geom_point() +
geom_line(alpha = 0.3) +
facet_grid(MUT_ID~.) +
scale_y_continuous(limits = c(0, NA)) +
# This layer swaps in the per-mutation summary table as its data.
geom_ribbon(data = demo_mean_sd, aes(ymin = sd_interval_lower, ymax = sd_interval_upper))
错误:
Error in eval(expr, envir, enclos) : object 'Sample_Run' not found
我也一直没能找到用其他图形类型(geom)实现这一点的方法,原因同样在于分面(facet)。
发布于 2018-04-19 14:49:41
一种方法是在创建绘图之前先合并两个数据框。您可以用 dplyr 按 MUT_ID 进行合并,如下所示:
# dplyr provides left_join(); it must be attached for this snippet to run.
library(dplyr)

# Attach each mutation's summary columns (sd, mean and the interval bounds) to
# every observation row, so all plot layers can be drawn from one data frame.
# Every row of demo_df is kept; the summary values repeat within a mutation.
demo_df_merged <- left_join(demo_df, demo_mean_sd, by = "MUT_ID")
合并之后,就可以像平常一样用 geom_line 绘制上界、下界以及平均值。
# Observations plus per-mutation summary lines, one facet row per mutation.
# mean, sd_interval_upper and sd_interval_lower are constant within a mutation,
# so the geom_line() layers drawn from them appear as horizontal lines.
ggplot(
  data = demo_df_merged,
  aes(x = Sample_Run, color = MUT_ID, group = MUT_ID)
) +
  geom_point(aes(y = FREQ)) +
  geom_line(aes(y = FREQ), alpha = 0.3) +
  geom_line(aes(y = mean)) +
  # Line thickness is set with `linewidth`; using `size` for lines is
  # deprecated since ggplot2 3.4.0.
  geom_line(aes(y = sd_interval_upper), linewidth = 2) +
  geom_line(aes(y = sd_interval_lower), linewidth = 2) +
  facet_grid(MUT_ID ~ .) +
  scale_y_continuous(limits = c(0, NA))
发布于 2018-04-19 15:10:28
首先,当所有数据都放在同一个长格式的数据框中时,ggplot2 的效果最好,所以我首先会避免创建更多的数据框。
第二,我会换一种方式来看待这些数据。带状区域(ribbon)表示沿 x 轴的某种连续变化,通常是时间,而这里并不是这种情况。如果您关心的是各个突变在重复样本之间的重现性,我建议把 MUT_ID 放在 x 轴上,把各个重复样本画成点,然后使用 stat_summary 添加带误差线的平均值。
就像这样:
library(tidyverse)

# Replicates as jittered points per mutation, overlaid with the mean and an
# error bar spanning mean +/- 2 SD.
# mean_sdl() returns the mean together with mean - mult * SD and
# mean + mult * SD (mult = 2 is its default, spelled out here). It wraps
# Hmisc::smean.sdl(), so the Hmisc package must be installed.
demo_df %>%
  ggplot(aes(MUT_ID, FREQ)) +
  geom_jitter(
    aes(color = Sample_Run),
    width = 0.2,
    size = 2
  ) +
  stat_summary(
    fun.data = mean_sdl,
    fun.args = list(mult = 2),
    geom = "errorbar",
    width = 0.2
  ) +
  # `fun` replaces the `fun.y` argument, which is deprecated since
  # ggplot2 3.3.0.
  stat_summary(
    fun = mean,
    geom = "point",
    fill = "red",
    size = 4,
    shape = 23
  ) +
  theme_bw()
https://stackoverflow.com/questions/49931085
复制相似问题