## Libraries
```r
library(knitr)
library(tidyverse)
library(lubridate)
library(stringr)
library(jsonlite)
library(ggbeeswarm)
library(ggridges)
library(quanteda)
```
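All of these are on CRAN, so a one-time setup sketch in case any of them is missing (assumption: the CRAN versions suffice; stringr ships with the tidyverse):

```r
# one-time setup: install the packages used above from CRAN
install.packages(c("knitr", "tidyverse", "lubridate", "jsonlite",
                   "ggbeeswarm", "ggridges", "quanteda"))
```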
## Data
```r
unzip(zipfile = "../articles.zip")
articles <- read_json("articles.json")

languages <- c("German", "Spanish", "French", "Italian", "Japanese",
               "Portuguese", "Chinese", "English", "Russian", "Arabic")

## aggregating article features
articles_summaries <- articles %>%
  map(.f = function(article) {
    # one row per language: does this article exist there, and how many comments did it get?
    comments <- map(.x = languages, .f = function(lang) {
      nr_comments <- length(article$content[[lang]]$comments)
      article_exists <- !is.null(article$content[[lang]])
      data.frame(language = lang, nr_comments = nr_comments,
                 article_exists = article_exists)
    }) %>%
      bind_rows()
    data.frame(title = article$TITLE,
               link = article$link,
               published = article$PUBLISHED,
               created = article$CREATED,
               subject = article$SUBJECT,
               authortag = article$AUTHORSTAG,
               author = article$AUTHORS,
               # we assume the first-mentioned category is the most important one
               category = article$category[[1]][[1]]) %>%
      cbind(., comments)
  }) %>%
  bind_rows() %>%
  as_tibble() %>%
  mutate(published = parse_date_time(published, orders = "%d.%m.%Y %H:%M", tz = "CET"),
         created = parse_date_time(created, orders = "%d.%m.%Y %H:%M", tz = "CET"))
```
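The pipeline above assumes each element of `articles` carries the fields it accesses: `TITLE`, `link`, `PUBLISHED`, `CREATED`, `SUBJECT`, `AUTHORSTAG`, `AUTHORS`, a nested `category` list, and a `content` list keyed by language. A minimal, hypothetical record of that shape (invented values; field names taken from the code above):

```r
# hypothetical single article record, shaped like what the map() above expects
example_article <- list(
  TITLE      = "Example headline",
  link       = "https://example.org/article",
  PUBLISHED  = "01.01.2020 12:00",
  CREATED    = "01.01.2020 11:30",
  SUBJECT    = "Example subject",
  AUTHORSTAG = "Jane Doe",
  AUTHORS    = "Jane Doe",
  category   = list(list("Politics")),
  content    = list(English = list(comments = list("First comment", "Second comment")))
)
```

Languages with no entry under `content` come back as `NULL`, so the `is.null()` check marks them as `article_exists = FALSE` and their comment count is 0.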
## Descriptive statistics
```r
## computing some summary statistics
articles_summaries %>%
  group_by(language) %>%
  summarise(`Number of articles` = sum(article_exists),
            `Total comments` = sum(nr_comments),
            `Avg number of comments per article` = round(`Total comments` / `Number of articles`, 1)) %>%
  arrange(-`Avg number of comments per article`) %>%
  kable()
```
|language   | Number of articles| Total comments| Avg number of comments per article|
|:----------|------------------:|--------------:|----------------------------------:|
|English    |               5691|          12915|                                2.3|
|German     |               1488|           1129|                                0.8|
|French     |               1462|            776|                                0.5|
|Italian    |               1346|            563|                                0.4|
|Spanish    |               2252|            833|                                0.4|
|Arabic     |               2237|            404|                                0.2|
|Portuguese |               1766|            319|                                0.2|
|Russian    |               1719|            264|                                0.2|
|Chinese    |               2066|            118|                                0.1|
|Japanese   |               2161|            101|                                0.0|
```r
articles_summaries %>%
  group_by(category) %>%
  summarise(`Number of articles` = sum(article_exists),
            `Total comments` = sum(nr_comments),
            `Avg number of comments per article` = round(`Total comments` / `Number of articles`, 1)) %>%
  arrange(-`Avg number of comments per article`) %>%
  kable()
```
|category         | Number of articles| Total comments| Avg number of comments per article|
|:----------------|------------------:|--------------:|----------------------------------:|
|Society          |               2033|           2149|                                1.1|
|Education        |                649|            657|                                1.0|
|Lifestyle        |                483|            493|                                1.0|
|Politics         |               4104|           4105|                                1.0|
|Direct democracy |               1129|           1045|                                0.9|
|Law and order    |                956|            819|                                0.9|
|Weather          |                292|            249|                                0.9|
|Conflict         |                242|            188|                                0.8|
|Work             |                565|            470|                                0.8|
|Business         |               3320|           2443|                                0.7|
|Disaster         |                320|            226|                                0.7|
|Health           |                730|            478|                                0.7|
|NA               |               1691|           1212|                                0.7|
|Religion         |                228|            162|                                0.7|
|Environment      |                872|            515|                                0.6|
|Foreign Affairs  |                209|            126|                                0.6|
|Human interest   |                453|            281|                                0.6|
|Sci & Tech       |               1477|            783|                                0.5|
|Sport            |                440|            240|                                0.5|
|Culture          |               1995|            781|                                0.4|
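The NA row collects articles with no category in the source data. If that label is distracting, one option (not part of the original analysis) is to replace the missing values with an explicit level before grouping, e.g. with tidyr's `replace_na()`:

```r
## sketch: label uncategorised articles explicitly before grouping
articles_summaries %>%
  mutate(category = replace_na(category, "Uncategorised")) %>%
  group_by(category) %>%
  summarise(`Total comments` = sum(nr_comments))
```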
```r
articles_summaries %>%
  filter(authortag != "") %>%
  group_by(authortag) %>%
  summarise(`Number of articles` = sum(article_exists),
            `Total comments` = sum(nr_comments),
            `Avg number of comments per article` = round(`Total comments` / `Number of articles`, 1)) %>%
  filter(`Number of articles` >= 15) %>%
  arrange(-`Avg number of comments per article`) %>%
  kable()
```
|authortag                                       | Number of articles| Total comments| Avg number of comments per article|
|:-----------------------------------------------|------------------:|--------------:|----------------------------------:|
|Julie Hunt,Kai Reusser                          |                 15|             40|                                2.7|
|Dale Bechtel                                    |                 35|             66|                                1.9|
|Urs Geiser                                      |                227|            419|                                1.8|
|Sibilla Bondolfi                                |                327|            568|                                1.7|
|Susan Misicka,Kai Reusser                       |                 27|             47|                                1.7|
|Jessica Davis Plüss                             |                128|            202|                                1.6|
|Isobel Leybold-Johnson                          |                172|            242|                                1.4|
|Kai Reusser,Sonia Fenazzi                       |                 19|             26|                                1.4|
|Patricia Islas                                  |                 27|             37|                                1.4|
|Philipp Meier                                   |                 27|             38|                                1.4|
|Alexandra Kohler                                |                 47|             58|                                1.2|
|Anand Chandrasekhar                             |                209|            247|                                1.2|
|Armando Mombelli                                |                 92|            106|                                1.2|
|Marie Vuilleumier                               |                 79|             96|                                1.2|
|Sibilla Bondolfi,Ester Unterfinger              |                 39|             48|                                1.2|
|Sonia Fenazzi                                   |                 44|             54|                                1.2|
|Susan Misicka                                   |                264|            305|                                1.2|
|Thomas Stephens                                 |                322|            400|                                1.2|
|Balz Rigendinger                                |                 28|             30|                                1.1|
|Domhnall OSullivan                              |                171|            196|                                1.1|
|Dominique Soguel-dit-Picard                     |                 85|             90|                                1.1|
|Katy Romy                                       |                149|            171|                                1.1|
|Alexandra Kohler,Céline Stegmüller              |                 16|             16|                                1.0|
|Jo Fahy                                         |                 53|             53|                                1.0|
|Luigi Jorio                                     |                224|            220|                                1.0|
|Olivier Pauchard                                |                136|            142|                                1.0|
|Peter Siegenthaler                              |                117|            115|                                1.0|
|Renat Kuenzi                                    |                126|            123|                                1.0|
|Andrea Tognina                                  |                122|            105|                                0.9|
|Geraldine Wong Sak Hoi                          |                 57|             54|                                0.9|
|Marcela Aguila Rubín                            |                 23|             21|                                0.9|
|Jessica Davis Plüss,Dominique Soguel-dit-Picard |                 23|             18|                                0.8|
|Jessica Davis Plüss,Kai Reusser                 |                 15|             12|                                0.8|
|Jie Guo Zehnder                                 |                 32|             24|                                0.8|
|Julia Crawford                                  |                148|            121|                                0.8|
|Marc-André Miserez                              |                108|             87|                                0.8|
|Simon Bradley                                   |                363|            275|                                0.8|
|Carlo Pisani                                    |                 22|             15|                                0.7|
|Christian Raaflaub                              |                147|            108|                                0.7|
|Kathrin Ammann                                  |                 53|             35|                                0.7|
|Larissa M. Bieler                               |                 24|             16|                                0.7|
|Celia Luterbacher                               |                 60|             39|                                0.6|
|Frédéric Burnand                                |                103|             57|                                0.6|
|Samuel Jaberg                                   |                154|             98|                                0.6|
|Julie Hunt                                      |                277|            132|                                0.5|
|Matthew Allen                                   |                435|            210|                                0.5|
|Michele Andina                                  |                 97|             48|                                0.5|
|Helen James                                     |                213|             90|                                0.4|
|Olivier Pauchard,Thomas Kern                    |                 17|              6|                                0.4|
|Céline Stegmüller                               |                178|             55|                                0.3|
|Thomas Stephens,Kai Reusser                     |                 25|              8|                                0.3|
|Alexander Thoele                                |                 16|              4|                                0.2|
|Ester Unterfinger                               |                226|             38|                                0.2|
|Thomas Kern                                     |                100|             21|                                0.2|
|Akiko Uehara                                    |                 49|              4|                                0.1|
|Dahai Shao                                      |                 30|              4|                                0.1|
|Eduardo Simantob                                |                 37|              5|                                0.1|
|Eduardo Simantob,Carlo Pisani                   |                 20|              2|                                0.1|
|Marguerite Meyer                                |                 33|              2|                                0.1|
|Zeno Zoccatelli                                 |                 16|              1|                                0.1|
|Carlo Pisani,Eduardo Simantob                   |                 92|              1|                                0.0|
|Deganit Perez                                   |                 85|              3|                                0.0|
|Eduardo Simantob,swissinfo                      |                 19|              0|                                0.0|
|Olivier Pauchard,Ester Unterfinger              |                 24|              1|                                0.0|
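Several tags above are comma-joined pairs ("Julie Hunt,Kai Reusser"), so co-authored articles are counted under a separate "author". A possible refinement, not part of the original analysis, is to split those tags with tidyr's `separate_rows()` so each person is credited for every article they worked on:

```r
## sketch: credit each co-author individually by splitting the comma-joined tags
articles_summaries %>%
  filter(authortag != "") %>%
  separate_rows(authortag, sep = ",") %>%
  group_by(authortag) %>%
  summarise(`Number of articles` = sum(article_exists),
            `Total comments` = sum(nr_comments))
```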
## Plots
```r
## distribution of comments per language
articles_summaries %>%
  filter(article_exists) %>%
  ggplot(aes(x = language, y = nr_comments, color = language)) +
  geom_boxplot(alpha = 0.1) +
  geom_quasirandom(alpha = 0.7) +
  guides(color = "none") +
  labs(x = "Language", y = "Number of comments") +
  coord_flip()
```
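The comment counts look heavily right-skewed (per the tables above, most articles get no comments while a few get dozens), so the raw-count axis compresses the bulk of the points near zero. A possible variant, not in the original, spreads them out with a square-root scale:

```r
## variant: square-root scale to unpack the long tail of comment counts
articles_summaries %>%
  filter(article_exists) %>%
  ggplot(aes(x = language, y = nr_comments, color = language)) +
  geom_quasirandom(alpha = 0.7) +
  scale_y_sqrt() +
  guides(color = "none") +
  labs(x = "Language", y = "Number of comments (square-root scale)") +
  coord_flip()
```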

```r
## distribution of comments per language with a ridge plot
articles_summaries %>%
  filter(article_exists) %>%
  ggplot(aes(x = nr_comments, y = language, group = language, fill = language)) +
  geom_density_ridges2(rel_min_height = 0.001, scale = 0.95, alpha = 0.6) +
  guides(fill = "none") +
  labs(x = "Number of comments", y = "Language")
```

```r
## distribution of comments per language and category
articles_summaries %>%
  filter(article_exists) %>%
  # keep the 5 most frequent categories, lump the rest into "Other"
  mutate(category = fct_lump(f = category, n = 5)) %>%
  ggplot(aes(x = category, y = nr_comments, color = category)) +
  geom_boxplot(alpha = 0.1) +
  geom_quasirandom(alpha = 0.7) +
  guides(color = "none") +
  facet_wrap(~ language, ncol = 3, scales = "free") +
  labs(x = "Category", y = "Number of comments") +
  coord_flip()
```
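`fct_lump()` from forcats keeps the `n` most frequent levels and collapses everything else into "Other", which keeps the facets readable. A minimal, self-contained illustration:

```r
library(forcats)

# "a" is the most frequent level, so with n = 1 every other level is lumped
fct_lump(factor(c("a", "a", "b", "c", "d")), n = 1)
#> [1] a     a     Other Other Other
#> Levels: a Other
```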
