代码之家 › 专栏 › 技术社区 › Programming Noob

计算散点图中的标签数量

scatter-plot plot ggplot2 r

0

Programming Noob · 技术社区 · 2 年前

我有三个散点图,分别对应三种癌症类型,用于比较两个连续数值列。数据中的每一行都属于某一种癌症类型。

以下是一小部分数据:

structure(list(cancer_type = c("Renal Cell Carcinoma", "Melanoma", 
"Renal Cell Carcinoma", "Renal Cell Carcinoma", "Melanoma", "Renal Cell Carcinoma", 
"Melanoma", "Renal Cell Carcinoma", "Melanoma", "Renal Cell Carcinoma", 
"Renal Cell Carcinoma", "Renal Cell Carcinoma", "Melanoma", "Renal Cell Carcinoma", 
"Renal Cell Carcinoma", "Renal Cell Carcinoma", "Melanoma", "Renal Cell Carcinoma", 
"Renal Cell Carcinoma", "Renal Cell Carcinoma", "Melanoma", "Renal Cell Carcinoma", 
"Renal Cell Carcinoma", "Renal Cell Carcinoma", "Renal Cell Carcinoma", 
"Urothelial Carcinoma", "Urothelial Carcinoma", "Urothelial Carcinoma", 
"Urothelial Carcinoma", "Urothelial Carcinoma", "Urothelial Carcinoma", 
"Urothelial Carcinoma", "Urothelial Carcinoma", "Urothelial Carcinoma", 
"Urothelial Carcinoma", "Urothelial Carcinoma"), Model1 = c(0.144175127148628, 
0.145591989159584, 0.0984272509813309, 0.0906868129968643, 0.28544145822525, 
0.138114541769028, 0.091837003827095, 0.0904595032334328, 0.211963757872581, 
0.163982316851616, 0.0935302376747131, 0.127466395497322, 0.117602989077568, 
0.18533518910408, 0.0753359571099281, 0.157020777463913, 0.211388036608696, 
0.0914847329258919, 0.177859485149384, 0.137649402022362, 0.240238919854164, 
0.10163140296936, 0.128856286406517, 0.1811293810606, 0.145569115877151, 
0.108640238642693, 0.157251104712486, 0.141889616847038, 0.0737133473157883, 
0.140953287482262, 0.196182891726494, 0.135421812534332, 0.174105599522591, 
0.0961336940526962, 0.0573264583945274, 0.0880825147032738), 
    Model2 = c(0.314525783061981, 0.343217849731445, 0.315391361713409, 
    0.353350460529327, 0.562197327613831, 0.292534917593002, 
    0.616392850875854, 0.284660279750824, 0.532478809356689, 
    0.341239869594574, 0.35737070441246, 0.31985279917717, 0.619661331176758, 
    0.224026560783386, 0.268743008375168, 0.344117254018784, 
    0.542939126491547, 0.267527014017105, 0.2816281914711, 0.443801760673523, 
    0.552633106708527, 0.387285768985748, 0.186705753207207, 
    0.234086975455284, 0.287418365478516, 0.564366817474365, 
    0.392496168613434, 0.642540812492371, 0.688632488250732, 
    0.430574655532837, 0.360769122838974, 0.58690744638443, 0.510010659694672, 
    0.559859037399292, 0.665197253227234, 0.460800766944885), 
    stage = c("STAGE1", "STAGE1", "STAGE2", "STAGE4", "STAGE4", 
    "STAGE4", "STAGE3", "STAGE3", "STAGE1", "STAGE2", "STAGE2", 
    "STAGE1", "STAGE1", "STAGE3", "STAGE4", "STAGE1", "STAGE1", 
    "STAGE1", "STAGE1", "STAGE3", "STAGE2", "STAGE4", "STAGE4", 
    "STAGE1", "STAGE1", "STAGE1", "STAGE2", "STAGE4", "STAGE3", 
    "STAGE3", "STAGE2", "STAGE1", "STAGE1", "STAGE4", "STAGE3", 
    "STAGE2")), class = "data.frame", row.names = c("04d83340b8bd", 
"122T", "1c2a5ac94492", "1d209304d988", "212T", "24ab7fecc92e", 
"356T", "379fe8924c51", "39T", "3ec4d3fc8bd1", "3f78044299b5", 
"4260f878a482", "430T", "43b757f285d8", "49c4c0e12e32", "55cc6edfad7f", 
"62T", "689be0421d3c", "8237266761ca", "85d99ff60fa1", "9T", 
"a4d25b70d77c", "a74ac0179106", "ac07fd7297c8", "c0f7a7b642cd", 
"SAM3cb94b0d5297", "SAM47fc46c3d6be", "SAM4b0175e8db6e", "SAM4b7ea015fd9e", 
"SAM553c3c35b847", "SAM560f23d6a3ad", "SAM5c139c5c1c4f", "SAM5cc2d9036053", 
"SAM5cfa1699bdb7", "SAM5d989c86255e", "SAM6157c8f38b72"))

正如你在每个图中看到的,有一条水平线和一条垂直线。当然,我可以随意调整那条线的位置。

有三种颜色:黄色、蓝色和灰色。我需要每个象限中每种颜色的点数。

例如,在黑色素瘤图中,右下象限只有一个黄点;左上象限有两个黄点、一个蓝点和一个灰点。在我的真实数据中,点要多得多,这只是一个小例子。

我需要每张图中每个象限的点数。我该怎么做?

这是制作绘图的代码,如果需要,可以进行调整:

# Faceted scatter plot comparing two model score columns.
#
# Draws `1 - <Model_2 column>` on the x axis against `<Model_1 column>` on the
# y axis, one panel per `cancer_type`, points filled by `stage`, plus dashed
# reference lines at `x` (vertical) and `y` (horizontal).
#
# Arguments:
#   data     data frame with `cancer_type`, `stage` and the two score columns
#   Model_1  column name (string) plotted on the y axis
#   Model_2  column name (string) plotted, as 1 - value, on the x axis
#   x        position of the vertical dashed line
#   y        position of the horizontal dashed line
#
# Returns the ggplot object.
#
# All arguments are required: the former defaults (`data = data`, ...) referred
# to themselves and could never be evaluated.
scatterplot_for_models <- function(data, Model_1, Model_2, x, y) {
  # `.data[[name]]` looks the column up inside ggplot2's data mask, so the
  # values always stay aligned with the rows ggplot2 assigns to each panel
  # (unlike `data[[name]]`, which injects an external vector).
  ggplot(data, aes(1 - .data[[Model_2]], .data[[Model_1]], fill = stage)) +
    geom_point(size = 4, pch = 21) +
    theme_classic() +
    facet_wrap(. ~ cancer_type) +
    scale_fill_manual(values = c("#E69F00", "#56B4E9", "#999999", "#999999")) +
    xlab("Model 2") +
    ylab("Model 1") +
    geom_hline(yintercept = y, linetype = 2) +
    geom_vline(xintercept = x, linetype = 2)
}

0 回复 | 直到 2 年前

1

4

tjebo 2 年前

这相当简单,只要你不试图在 ggplot 内部完成计数。事先对数据做几步汇总,再把汇总结果用于绘图。在这种情况下,我认为最直接的方法是先定义标签的绘制坐标,然后统计每个象限中的点数(您需要决定如何归类恰好落在分界线上的点)。

library(tidyverse)

## assuming you know your cut off values
y_cut <- .16
x_cut <- .5
## quadrant ids, clockwise from top right (quad1) to top left (quad4)
quad_levels <- c("quad1", "quad2", "quad3", "quad4")
## basic data frame modification for easier coding
data <- data %>%
  mutate(
    x = 1 - Model2,
    y = Model1
  )
## first, get the label positions: halfway between each cut off and the
## most extreme data value on that side
label_pos <-
  data %>%
  summarise(
    x_right = mean(c(x_cut, max(x))),
    x_left = mean(c(x_cut, min(x))),
    y_top = mean(c(y_cut, max(y))),
    y_bottom = mean(c(y_cut, min(y)))
  )
## one row per quadrant, keyed by `quad`, so the coordinates can be joined by
## name instead of relying on row order and a fixed number of facets
label_coord <- data.frame(
  quad = factor(quad_levels, levels = quad_levels),
  x_lab = with(label_pos, c(x_right, x_right, x_left, x_left)),
  y_lab = with(label_pos, c(y_top, y_bottom, y_bottom, y_top))
)
## now, summarise how many dots are per quadrant based on those cut offs.
## Points lying exactly on a cut off are counted on the right / top side, so
## every point falls into exactly one quadrant.
data_count <- data %>%
  mutate(
    quad = case_when(
      x >= x_cut & y >= y_cut ~ "quad1",
      x >= x_cut & y < y_cut ~ "quad2",
      x < x_cut & y < y_cut ~ "quad3",
      x < x_cut & y >= y_cut ~ "quad4"
    ),
    ## convert both groups to factors; fixed levels keep empty quadrants
    ## in the count (as n = 0) together with .drop = FALSE
    cancer_type = as.factor(cancer_type),
    quad = factor(quad, levels = quad_levels)
  ) %>%
  count(cancer_type, quad, .drop = FALSE)
## now add the label positions, matched by quadrant
data_annot <- data_count %>%
  left_join(label_coord, by = "quad")
## do the data transformation before
ggplot(mapping = aes(x, Model1)) +
  geom_point(data = data, aes(fill = stage), size = 4, pch = 21) +
  ## now you can simply use geom_text for annotation
  geom_text(data = data_annot, mapping = aes(x_lab, y_lab, label = paste("n=", n))) +
  facet_wrap(. ~ cancer_type) +
  scale_fill_manual(values = c("#E69F00", "#56B4E9", "#999999", "#999999")) +
  xlab("Model 2") +
  ylab("Model 1") +
  theme_classic() +
  geom_hline(yintercept = y_cut, linetype = 2) +
  geom_vline(xintercept = x_cut, linetype = 2)

2

I_O 2 年前

您可以从 ggplot_build(p) 的输出中提取 data 元素,来访问 ggplot 处理后的数据,其中 p 是之前 ggplot 调用的结果。示例:

## data frame that ggplot2 computed for the first layer of the saved plot `p`
ggplot_build(p)$data[[1]]

这些数据很容易制成表格,例如使用{dplyr},使用您的示例数据:

## cut-off positions of the dashed reference lines
x_intercept <- .5
y_intercept <- .16

## save ggplot object as "p"
## x shows 1 - Model2 and y shows Model1, so the axes are labelled
## "Model 2" and "Model 1" respectively
p <- df |>
  ggplot(aes(1 - Model2, Model1, fill = stage)) +
  geom_point(size = 4, pch = 21) +
  theme_classic() +
  facet_wrap(~ cancer_type) +
  scale_fill_manual(values = c("#E69F00", "#56B4E9", "#999999", "#999999")) +
  xlab("Model 2") +
  ylab("Model 1") +
  geom_hline(yintercept = y_intercept, linetype = 2) +
  geom_vline(xintercept = x_intercept, linetype = 2)

p ## display ggplot

将ggplot数据制成表格:

library(dplyr)

## Tabulate the points of the first layer of `p` by panel, fill colour and
## the side of each cut-off line they fall on.
count(
  ggplot_build(p)$data[[1]],
  ## the facet_wrap panel index (~ cancer_type)
  PANEL,
  ## the fill (~ stage)
  fill,
  x > x_intercept,
  y > y_intercept
)

输出(截断):

+    PANEL    fill x > x_intercept y > y_intercept n
1      1 #56B4E9           FALSE            TRUE 1
2      1 #999999           FALSE           FALSE 1
3      1 #999999           FALSE            TRUE 1
4      1 #E69F00           FALSE           FALSE 1

3

1

langtang 2 年前

您可以使用 case_when() 确定每个点所在的象限,然后使用 count() 按 stage 和 cancer_type 分组计数:

## cut-offs on the plotted axes: the x axis shows 1 - Model2, the y axis Model1
x <- 0.5
y <- 0.16
## q names the quadrant as seen in the plot: u/l = upper/lower, l/r = left/right.
## The comparison uses 1 - Model2 (not Model2 itself) so that "left"/"right"
## match the side of the vertical line the point is drawn on.
df %>%
  mutate(q = case_when(
    1 - Model2 < x & Model1 > y ~ "ul",
    1 - Model2 >= x & Model1 > y ~ "ur",
    1 - Model2 < x & Model1 <= y ~ "ll",
    1 - Model2 >= x & Model1 <= y ~ "lr"
  )) %>%
  count(stage, cancer_type, q)

4

1

TarJae 2 年前

以下是另一种使用 ggrepel 的方法:注意 df 是原始数据帧,而 df1 仅用于标记:

工作原理:

1.创建数据帧仅用于标记目的。注意,我们必须消除所有重复!

library(tidyverse)

## Label data: one row per cancer type / quadrant / stage combination.
## helper_x and helper_y flag which side of each cut-off a point lies on;
## together with cancer_type they identify the quadrant of a panel.
df1 <- df %>%
  mutate(
    helper_x = ifelse(Model2 <= 0.5, "a", "b"),
    helper_y = ifelse(Model1 <= 0.16, "a", "b")
  ) %>%
  ## n = number of points sharing cancer type, quadrant and stage
  add_count(cancer_type, helper_x, helper_y, stage) %>%
  mutate(label = paste(stage, "N=", n, sep = " ")) %>%
  ## keep the first point of every combination; its Model1 / Model2 values
  ## anchor the label in the plot
  distinct(cancer_type, helper_x, helper_y, stage, .keep_all = TRUE) %>%
  ## ungrouped result, ordered by the grouping keys
  arrange(cancer_type, helper_x, helper_y, stage)

2.绘图和标签:

library(tidyverse)
library(ggrepel)

## Scatter plot of the original data `df`, faceted by cancer type, with dashed
## cut-off lines; `df1` supplies the repelled count labels.
ggplot(
  data = df,
  mapping = aes(x = 1 - Model2, y = Model1, color = stage)
) +
  geom_point(size = 4, alpha = 0.8) +
  geom_vline(xintercept = 0.5, linetype = "longdash") +
  geom_hline(yintercept = 0.16, linetype = "longdash") +
  geom_label_repel(
    data = df1,
    mapping = aes(label = label),
    box.padding = 0.35,
    point.padding = 0.5,
    segment.color = "grey50",
    show.legend = FALSE
  ) +
  facet_wrap(. ~ cancer_type) +
  scale_color_manual(
    values = c("#E69F00", "#56B4E9", "#999999", "#999999")
  ) +
  labs(x = "Model2") +
  theme_classic()