Libraries
library(knitr)
library(tidyverse)
library(lubridate)
library(stringr)
library(jsonlite)
library(ggbeeswarm)
library(ggridges)
library(quanteda)
Data
unzip(zipfile = "../articles.zip")
articles <- read_json("articles.json")
languages <- c("German", "Spanish", "French", "Italian", "Japanese",
"Portuguese", "Chinese", "English", "Russian", "Arabic")
## aggregating article features
articles_summaries <- articles %>%
map(.f = function(article) {
comments <- map(.x = languages, .f = function(lang) {
nr_comments <- length(article$content[[lang]]$comments)
article_exists <- !is.null(article$content[[lang]])
data.frame(language = lang, nr_comments = nr_comments,
article_exists = article_exists)
}) %>%
bind_rows()
data.frame(title = article$TITLE,
link = article$link,
published = article$PUBLISHED,
created = article$CREATED,
subject = article$SUBJECT,
authortag = article$AUTHORSTAG,
author = article$AUTHORS,
# we assume first mentioned category is most important category
category = article$category[[1]][[1]]) %>%
cbind(., comments)
}) %>%
bind_rows() %>%
as_tibble() %>%
mutate(published = parse_date_time(published, orders = "%d.%m.%Y %H:%M", tz = "CET"),
created = parse_date_time(created, orders = "%d.%m.%Y %H:%M", tz = "CET"))
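For orientation, the mapping above assumes each element of articles is a nested list roughly of the following shape (a minimal sketch: the field names come from the accessors used above, all values are invented):
## sketch of the assumed structure of one element of `articles`
## (field names taken from the accessors above; all values invented)
example_article <- list(
  TITLE      = "Example article title",
  link       = "https://www.example.org/example-article",
  PUBLISHED  = "01.01.2019 12:00",
  CREATED    = "01.01.2019 11:30",
  SUBJECT    = "Example subject",
  AUTHORSTAG = "Jane Doe",
  AUTHORS    = "Jane Doe",
  category   = list(list("Politics")),
  content    = list(English = list(comments = list("A comment", "Another one")))
)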
Descriptive statistics
## computing some summary statistics
articles_summaries %>%
group_by(language) %>%
summarise(`Number of articles` = sum(article_exists),
`Total comments` = sum(nr_comments),
`Avg number of comments per article` = round(`Total comments`/`Number of articles`, 1)) %>%
arrange(-`Avg number of comments per article`) %>%
kable()
| language   | Number of articles | Total comments | Avg number of comments per article |
|:-----------|-------------------:|---------------:|-----------------------------------:|
| English    |               5691 |          12915 |                                2.3 |
| German     |               1488 |           1129 |                                0.8 |
| French     |               1462 |            776 |                                0.5 |
| Italian    |               1346 |            563 |                                0.4 |
| Spanish    |               2252 |            833 |                                0.4 |
| Arabic     |               2237 |            404 |                                0.2 |
| Portuguese |               1766 |            319 |                                0.2 |
| Russian    |               1719 |            264 |                                0.2 |
| Chinese    |               2066 |            118 |                                0.1 |
| Japanese   |               2161 |            101 |                                0.0 |
articles_summaries %>%
group_by(category) %>%
summarise(`Number of articles` = sum(article_exists),
`Total comments` = sum(nr_comments),
`Avg number of comments per article` = round(`Total comments`/`Number of articles`, 1)) %>%
arrange(-`Avg number of comments per article`) %>%
kable()
| category         | Number of articles | Total comments | Avg number of comments per article |
|:-----------------|-------------------:|---------------:|-----------------------------------:|
| Society          |               2033 |           2149 |                                1.1 |
| Education        |                649 |            657 |                                1.0 |
| Lifestyle        |                483 |            493 |                                1.0 |
| Politics         |               4104 |           4105 |                                1.0 |
| Direct democracy |               1129 |           1045 |                                0.9 |
| Law and order    |                956 |            819 |                                0.9 |
| Weather          |                292 |            249 |                                0.9 |
| Conflict         |                242 |            188 |                                0.8 |
| Work             |                565 |            470 |                                0.8 |
| Business         |               3320 |           2443 |                                0.7 |
| Disaster         |                320 |            226 |                                0.7 |
| Health           |                730 |            478 |                                0.7 |
| NA               |               1691 |           1212 |                                0.7 |
| Religion         |                228 |            162 |                                0.7 |
| Environment      |                872 |            515 |                                0.6 |
| Foreign Affairs  |                209 |            126 |                                0.6 |
| Human interest   |                453 |            281 |                                0.6 |
| Sci & Tech       |               1477 |            783 |                                0.5 |
| Sport            |                440 |            240 |                                0.5 |
| Culture          |               1995 |            781 |                                0.4 |
articles_summaries %>%
filter(authortag != "") %>%
group_by(authortag) %>%
summarise(`Number of articles` = sum(article_exists),
`Total comments` = sum(nr_comments),
`Avg number of comments per article` = round(`Total comments`/`Number of articles`, 1)) %>%
filter(`Number of articles` >= 15) %>%
arrange(-`Avg number of comments per article`) %>%
kable()
| authortag | Number of articles | Total comments | Avg number of comments per article |
|:----------|-------------------:|---------------:|-----------------------------------:|
| Julie Hunt,Kai Reusser | 15 | 40 | 2.7 |
| Dale Bechtel | 35 | 66 | 1.9 |
| Urs Geiser | 227 | 419 | 1.8 |
| Sibilla Bondolfi | 327 | 568 | 1.7 |
| Susan Misicka,Kai Reusser | 27 | 47 | 1.7 |
| Jessica Davis Plüss | 128 | 202 | 1.6 |
| Isobel Leybold-Johnson | 172 | 242 | 1.4 |
| Kai Reusser,Sonia Fenazzi | 19 | 26 | 1.4 |
| Patricia Islas | 27 | 37 | 1.4 |
| Philipp Meier | 27 | 38 | 1.4 |
| Alexandra Kohler | 47 | 58 | 1.2 |
| Anand Chandrasekhar | 209 | 247 | 1.2 |
| Armando Mombelli | 92 | 106 | 1.2 |
| Marie Vuilleumier | 79 | 96 | 1.2 |
| Sibilla Bondolfi,Ester Unterfinger | 39 | 48 | 1.2 |
| Sonia Fenazzi | 44 | 54 | 1.2 |
| Susan Misicka | 264 | 305 | 1.2 |
| Thomas Stephens | 322 | 400 | 1.2 |
| Balz Rigendinger | 28 | 30 | 1.1 |
| Domhnall OSullivan | 171 | 196 | 1.1 |
| Dominique Soguel-dit-Picard | 85 | 90 | 1.1 |
| Katy Romy | 149 | 171 | 1.1 |
| Alexandra Kohler,Céline Stegmüller | 16 | 16 | 1.0 |
| Jo Fahy | 53 | 53 | 1.0 |
| Luigi Jorio | 224 | 220 | 1.0 |
| Olivier Pauchard | 136 | 142 | 1.0 |
| Peter Siegenthaler | 117 | 115 | 1.0 |
| Renat Kuenzi | 126 | 123 | 1.0 |
| Andrea Tognina | 122 | 105 | 0.9 |
| Geraldine Wong Sak Hoi | 57 | 54 | 0.9 |
| Marcela Aguila Rubín | 23 | 21 | 0.9 |
| Jessica Davis Plüss,Dominique Soguel-dit-Picard | 23 | 18 | 0.8 |
| Jessica Davis Plüss,Kai Reusser | 15 | 12 | 0.8 |
| Jie Guo Zehnder | 32 | 24 | 0.8 |
| Julia Crawford | 148 | 121 | 0.8 |
| Marc-André Miserez | 108 | 87 | 0.8 |
| Simon Bradley | 363 | 275 | 0.8 |
| Carlo Pisani | 22 | 15 | 0.7 |
| Christian Raaflaub | 147 | 108 | 0.7 |
| Kathrin Ammann | 53 | 35 | 0.7 |
| Larissa M. Bieler | 24 | 16 | 0.7 |
| Celia Luterbacher | 60 | 39 | 0.6 |
| Frédéric Burnand | 103 | 57 | 0.6 |
| Samuel Jaberg | 154 | 98 | 0.6 |
| Julie Hunt | 277 | 132 | 0.5 |
| Matthew Allen | 435 | 210 | 0.5 |
| Michele Andina | 97 | 48 | 0.5 |
| Helen James | 213 | 90 | 0.4 |
| Olivier Pauchard,Thomas Kern | 17 | 6 | 0.4 |
| Céline Stegmüller | 178 | 55 | 0.3 |
| Thomas Stephens,Kai Reusser | 25 | 8 | 0.3 |
| Alexander Thoele | 16 | 4 | 0.2 |
| Ester Unterfinger | 226 | 38 | 0.2 |
| Thomas Kern | 100 | 21 | 0.2 |
| Akiko Uehara | 49 | 4 | 0.1 |
| Dahai Shao | 30 | 4 | 0.1 |
| Eduardo Simantob | 37 | 5 | 0.1 |
| Eduardo Simantob,Carlo Pisani | 20 | 2 | 0.1 |
| Marguerite Meyer | 33 | 2 | 0.1 |
| Zeno Zoccatelli | 16 | 1 | 0.1 |
| Carlo Pisani,Eduardo Simantob | 92 | 1 | 0.0 |
| Deganit Perez | 85 | 3 | 0.0 |
| Eduardo Simantob,swissinfo | 19 | 0 | 0.0 |
| Olivier Pauchard,Ester Unterfinger | 24 | 1 | 0.0 |
Plots
## distribution of comments per language
articles_summaries %>%
filter(article_exists == TRUE) %>%
ggplot(aes(x = language, y = nr_comments, color = language)) +
geom_boxplot(alpha = 0.1) +
geom_quasirandom(alpha = 0.7) +
guides(color = FALSE) +
labs(x = "Language", y = "Number of comments") +
coord_flip()
## distribution of comments per language with ridge plot
articles_summaries %>%
filter(article_exists == TRUE) %>%
ggplot(aes(x = nr_comments, y = language, group = language, fill = language)) +
geom_density_ridges2(rel_min_height = 0.001, scale = 0.95, alpha = 0.6) +
guides(fill = FALSE) +
labs(x = "Number of comments", y = "Language")
## distribution of comments per language and category
articles_summaries %>%
filter(article_exists == TRUE) %>%
mutate(category = fct_lump(f = category, n = 5)) %>%
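# fct_lump(n = 5) keeps the five most frequent categories and lumps the rest into "Other"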
ggplot(aes(x = category, y = nr_comments, color = category)) +
geom_boxplot(alpha = 0.1) +
geom_quasirandom(alpha = 0.7) +
guides(color = FALSE) +
facet_wrap(~ language, ncol = 3, scales = "free") +
labs(x = "Category", y = "Number of comments") +
coord_flip()
## time-series plot per category
articles_summaries %>%
mutate(category = fct_lump(f = category, n = 4)) %>%
mutate(date = date(published)) %>%
group_by(date, category) %>%
summarise(sum_comments = sum(nr_comments)) %>%
ggplot(aes(x = date, y = sum_comments)) +
geom_col() +
geom_smooth(alpha = 0.5) +
scale_y_sqrt(breaks = c(0, 2, 10, 25, 50, 100, 150, 200)) +
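# the square-root y scale compresses the long right tail of the daily comment totals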
facet_wrap(~ category) +
labs(x = "Date", y = "Total number of comments")
## wordcloud plot
wordcloud_languages <- c("German", "English", "Italian", "French",
"Spanish", "Portuguese", "Russian",
"Arabic")
for (lang in wordcloud_languages) {
print(lang)
map(articles, function(article) {
article$content[[lang]]$comments
}) %>%
unlist() %>%
corpus() %>%
dfm(remove = c(stopwords(case_when(lang == "English" ~ "en",
lang == "German" ~ "de",
lang == "Italian" ~ "it",
lang == "French" ~ "fr",
lang == "Spanish" ~ "es",
lang == "Portuguese" ~ "por",
lang == "Arabic" ~ "ar",
lang == "Russian" ~ "ru")), "dass"),
remove_punct = TRUE, remove_numbers = TRUE) %>%
textplot_wordcloud(min_count = 6, random_order = FALSE,
rotation = 0.25, max_size = 8,
color = RColorBrewer::brewer.pal(4, "Dark2"))
}
## [1] "German"
## [1] "English"
## [1] "Italian"
## [1] "French"
## [1] "Spanish"
## [1] "Portuguese"
## [1] "Russian"
## [1] "Arabic"
Look at outliers
articles_summaries %>%
filter(nr_comments >= 40) %>%
select(title, nr_comments, language) %>%
rename(Title = title,
Comments = nr_comments,
Language = language) %>%
arrange(-Comments)
## # A tibble: 9 x 3
## Title Comments Language
## <chr> <int> <chr>
## 1 Minister: ‘UN aid agency is part of the problem in the Midd… 77 English
## 2 Voters approve ‘burka ban’ in St Gallen 60 English
## 3 The rebirth of Ma Anand Sheela: from Rajneeshee queen to ca… 50 English
## 4 If investors don’t overhaul banker pay, populism will 45 Spanish
## 5 How Santander’s ditching of Andrea Orcel stunned finance in… 45 Spanish
## 6 How to fit in at a Swiss workplace 43 English
## 7 Boris Johnson: entertaining raconteur or dangerous liar? 43 English
## 8 Dual national stripped of Swiss citizenship for first time 41 English
## 9 Racism in Switzerland: An expat’s perspective 41 English
Build regression model
articles_summaries_modeldf <- articles_summaries %>%
filter(article_exists == TRUE) %>%
mutate(language = relevel(factor(language), ref = "German"),
category = relevel(factor(category), ref = "Culture"))
## NB: this model is misspecified; we should account for the similarity between
## observations that come from the same article, e.g. via article-level random
## effects (sketched below). However, there was no time left for this.
# library(lme4)
# fit1 <- glmer.nb(nr_comments ~ language + category + (1|title),
# verbose = 2, data = articles_summaries_modeldf)
fit1 <- glm(nr_comments ~ language + category, family = "quasipoisson",
data = articles_summaries_modeldf)
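## The quasi-Poisson family lets the variance exceed the mean; the estimated
## dispersion parameter makes this explicit (a value well above 1 indicates
## overdispersion relative to a plain Poisson model)
summary(fit1)$dispersion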
coefs <- data.frame(summary(fit1)$coefficients)
data.frame(variable = rownames(coefs),
RR = exp(coefs$Estimate),
lower = exp(coefs$Estimate - qnorm(0.975)*coefs$Std..Error),
upper = exp(coefs$Estimate + qnorm(0.975)*coefs$Std..Error)) %>%
filter(variable != "(Intercept)") %>%
ggplot(aes(x = variable, y = RR)) +
geom_pointrange(aes(ymin = lower, ymax = upper)) +
geom_hline(yintercept = 1, lty = 2, alpha = 0.5) +
labs(x = "Variable", y = "Relative increase in number of comments",
title = "(slightly missspecified) model with reference: German, Culture") +
coord_flip()
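For a single coefficient the rate ratio can also be read off directly: exp() of the estimate gives the multiplicative effect on the expected number of comments relative to the reference level (a sketch, assuming the coefficient naming of the fit above):
## e.g. expected comments on English articles relative to the German reference
exp(coef(fit1)["languageEnglish"])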