我正在研究一個在線評論數據集,并使用R的stm包來分析數據。
我讀過一篇文章,其中說明了如何使用stm包來研究text-based數據,我發現它非常有用。特別是,我認為下面從帖子中截取的圖形可能對我的研究有用。
這篇文章只展示了這張圖的部分代碼。
topic_labels <- tribble(
~topic, ~category, ~color,
list(4, 8, 24, 50), "Anthropology/History", "#d4ae0b",
list(44, 48), "Journalism/Satire", "#3E7D49",
list(9, 17, 27, 30, 32), "Philosophy", "#c73200",
list(2, 3, 5, 11, 13, 15, 18, 20, 26, 28, 33, 34, 35, 39, 43), "Politics", "#de860b",
list(14, 23, 29, 31, 36, 37, 38, 42, 45, 47), "Political Economy", "#6F8FCF",
list(21, 25, 41), "Military", "#b7a4d6",
list(1, 6, 7, 10, 12, 16, 19, 22, 40), "Sociology", "#8f1f3f",
list(46, 49), "Science/Math", "#767676") %>%
unnest(topic) %>%
unnest(topic) %>%
mutate(topic = factor(topic))
我嘗試使用自己的數據集生成代碼,但失敗了。這是我的代碼:
library(stm)
library(tidyverse)
library(ggplot2)
# Step 1: Create a mapping of topics to categories
topic_categories <- c(
"Price" = "11,21,25,7",
"Services" = "1,9,5",
"Environment" = "18,13,24,3",
"Hygiene" = "26,19,4,23",
"Personnel" = "20,6,15",
"Values" = "14,17,22",
"Perception" = "2,16,8,12",
"Others" = "10"
)
# Step 2: Create a data frame with topic probabilities
topic_probabilities <- colMeans(stm26$theta)
topic_data <- data.frame(
topic = 1:length(topic_probabilities),
probability = topic_probabilities
)
# Step 3: Assign categories to topics
topic_data$category <- NA
for (cat in names(topic_categories)) {
topics <- as.numeric(strsplit(topic_categories[cat], ",")[[1]])
topic_data$category[topic_data$topic %in% topics] <- cat
}
# Step 4: Get top words for each topic
top_words <- labelTopics(stm26, n = 2)
topic_data$top_words <- apply(top_words$prob, 1, function(x) paste(x, collapse = ", "))
# Step 5: Assign colors to categories
category_colors <- c(
"Price" = "#8f1f3f",
"Services" = "#d4ae0b",
"Environment" = "#de860b",
"Hygiene" = "#6F8FCF",
"Personnel" = "#c73200",
"Values" = "#b7a4d6",
"Perception" = "#3E7D49",
"Others" = "#767676"
)
# Step 6: Create the plot
ggplot(topic_data, aes(y = reorder(topic, probability), x = probability, color = category)) +
geom_segment(aes(x = 0, xend = probability, yend = reorder(topic, probability)), size = 0.5) +
geom_point(size = 1) +
geom_text(aes(label = top_words), hjust = 0, nudge_x = 0.002, size = 3) +
scale_color_manual(values = category_colors) +
scale_x_continuous(labels = scales::percent_format(accuracy = 1),
limits = c(0, 0.18)) +
geom_vline(xintercept = seq(0.05, 0.15, by = 0.05), color = "lightgrey") +
facet_grid(category ~ ., scales = "free_y", space = "free_y", switch = "y") +
theme_minimal() +
theme(
axis.title.y = element_blank(),
axis.text.y = element_text(face = "bold", size = 8, color = "black", margin = margin(r = -25)),
axis.ticks.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
legend.position = "none",
strip.placement = "outside",
strip.text.y.left = element_text(angle = 0, hjust = 1, face = "bold"),
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, face = "italic"),
plot.margin = margin(5.5, 40, 5.5, 5.5) # Adjusted right margin to prevent text cutoff
) +
labs(
title = "Title",
subtitle = "Subtitle",
x = "Expected topic probability"
)
結果與帖子的圖形有點不同。
首先,我無法將類別名稱移動到主題行的頂部。其次,網格線(灰色)是間歇性的,不連續的。
有人知道如何修改代碼以生成類似于帖子中顯示的圖形嗎?
這里有一個選項,通過切換到
ggforce::facet_col
將條帶文本移動到頂部來實現所需的結果,使用ggtext
為條帶文本著色,最后使用annotation_custom
和clip="off"
獲得連續的網格線,沒有中斷。使用一些虛假的隨機示例數據: