Libraries

library(knitr)
library(tidyverse)
library(lubridate)
library(stringr)
library(jsonlite)
library(ggbeeswarm)
library(ggridges)
library(quanteda)

Data

## Extract the archive and read the raw, nested article list.
unzip(zipfile = "../articles.zip")
articles <- read_json("articles.json")

## The ten publication languages we aggregate over.
languages <- c(
  "German", "Spanish", "French", "Italian", "Japanese",
  "Portuguese", "Chinese", "English", "Russian", "Arabic"
)

## aggregating article features
## Flattens the nested article list into one row per article x language:
## shared article metadata alongside per-language comment counts.
articles_summaries <- articles %>% 
  map(.f = function(art) {
    
    ## One row per language; an article need not exist in every language,
    ## in which case nr_comments is 0 and article_exists is FALSE.
    per_language <- map(.x = languages, .f = function(lng) {
      data.frame(
        language       = lng,
        nr_comments    = length(art$content[[lng]]$comments),
        article_exists = !is.null(art$content[[lng]])
      )
    }) %>% 
      bind_rows()
    
    ## Single-row metadata frame for this article.
    meta <- data.frame(
      title     = art$TITLE,
      link      = art$link,
      published = art$PUBLISHED,
      created   = art$CREATED,
      subject   = art$SUBJECT,
      authortag = art$AUTHORSTAG,
      author    = art$AUTHORS,
      # we assume first mentioned category is most important category
      category  = art$category[[1]][[1]]
    )
    
    ## cbind recycles the single metadata row across all language rows.
    cbind(meta, per_language)
  }) %>% 
  bind_rows() %>% 
  as_tibble() %>% 
  mutate(
    published = parse_date_time(published, orders = "%d.%m.%Y %H:%M", tz = "CET"),
    created   = parse_date_time(created, orders = "%d.%m.%Y %H:%M", tz = "CET")
  )

Descriptive statistics

## computing some summary statistics: comment volume per language,
## sorted by average comments per article (descending)
articles_summaries %>% 
  group_by(language) %>% 
  summarise(
    `Number of articles` = sum(article_exists),
    `Total comments` = sum(nr_comments),
    `Avg number of comments per article` =
      round(`Total comments` / `Number of articles`, 1)
  ) %>% 
  arrange(desc(`Avg number of comments per article`)) %>% 
  kable()
language Number of articles Total comments Avg number of comments per article
English 5691 12915 2.3
German 1488 1129 0.8
French 1462 776 0.5
Italian 1346 563 0.4
Spanish 2252 833 0.4
Arabic 2237 404 0.2
Portuguese 1766 319 0.2
Russian 1719 264 0.2
Chinese 2066 118 0.1
Japanese 2161 101 0.0
## same summary statistics, broken down by article category
articles_summaries %>% 
  group_by(category) %>% 
  summarise(
    `Number of articles` = sum(article_exists),
    `Total comments` = sum(nr_comments),
    `Avg number of comments per article` =
      round(`Total comments` / `Number of articles`, 1)
  ) %>% 
  arrange(desc(`Avg number of comments per article`)) %>% 
  kable()
category Number of articles Total comments Avg number of comments per article
Society 2033 2149 1.1
Education 649 657 1.0
Lifestyle 483 493 1.0
Politics 4104 4105 1.0
Direct democracy 1129 1045 0.9
Law and order 956 819 0.9
Weather 292 249 0.9
Conflict 242 188 0.8
Work 565 470 0.8
Business 3320 2443 0.7
Disaster 320 226 0.7
Health 730 478 0.7
NA 1691 1212 0.7
Religion 228 162 0.7
Environment 872 515 0.6
Foreign Affairs 209 126 0.6
Human interest 453 281 0.6
Sci & Tech 1477 783 0.5
Sport 440 240 0.5
Culture 1995 781 0.4
## same summary statistics by author tag; drop untagged articles and
## authors with fewer than 15 articles to keep the averages meaningful
articles_summaries %>% 
  filter(authortag != "") %>% 
  group_by(authortag) %>% 
  summarise(
    `Number of articles` = sum(article_exists),
    `Total comments` = sum(nr_comments),
    `Avg number of comments per article` =
      round(`Total comments` / `Number of articles`, 1)
  ) %>% 
  filter(`Number of articles` >= 15) %>% 
  arrange(desc(`Avg number of comments per article`)) %>% 
  kable()
authortag Number of articles Total comments Avg number of comments per article
Julie Hunt,Kai Reusser 15 40 2.7
Dale Bechtel 35 66 1.9
Urs Geiser 227 419 1.8
Sibilla Bondolfi 327 568 1.7
Susan Misicka,Kai Reusser 27 47 1.7
Jessica Davis Plüss 128 202 1.6
Isobel Leybold-Johnson 172 242 1.4
Kai Reusser,Sonia Fenazzi 19 26 1.4
Patricia Islas 27 37 1.4
Philipp Meier 27 38 1.4
Alexandra Kohler 47 58 1.2
Anand Chandrasekhar 209 247 1.2
Armando Mombelli 92 106 1.2
Marie Vuilleumier 79 96 1.2
Sibilla Bondolfi,Ester Unterfinger 39 48 1.2
Sonia Fenazzi 44 54 1.2
Susan Misicka 264 305 1.2
Thomas Stephens 322 400 1.2
Balz Rigendinger 28 30 1.1
Domhnall OSullivan 171 196 1.1
Dominique Soguel-dit-Picard 85 90 1.1
Katy Romy 149 171 1.1
Alexandra Kohler,Céline Stegmüller 16 16 1.0
Jo Fahy 53 53 1.0
Luigi Jorio 224 220 1.0
Olivier Pauchard 136 142 1.0
Peter Siegenthaler 117 115 1.0
Renat Kuenzi 126 123 1.0
Andrea Tognina 122 105 0.9
Geraldine Wong Sak Hoi 57 54 0.9
Marcela Aguila Rubín 23 21 0.9
Jessica Davis Plüss,Dominique Soguel-dit-Picard 23 18 0.8
Jessica Davis Plüss,Kai Reusser 15 12 0.8
Jie Guo Zehnder 32 24 0.8
Julia Crawford 148 121 0.8
Marc-André Miserez 108 87 0.8
Simon Bradley 363 275 0.8
Carlo Pisani 22 15 0.7
Christian Raaflaub 147 108 0.7
Kathrin Ammann 53 35 0.7
Larissa M. Bieler 24 16 0.7
Celia Luterbacher 60 39 0.6
Frédéric Burnand 103 57 0.6
Samuel Jaberg 154 98 0.6
Julie Hunt 277 132 0.5
Matthew Allen 435 210 0.5
Michele Andina 97 48 0.5
Helen James 213 90 0.4
Olivier Pauchard,Thomas Kern 17 6 0.4
Céline Stegmüller 178 55 0.3
Thomas Stephens,Kai Reusser 25 8 0.3
Alexander Thoele 16 4 0.2
Ester Unterfinger 226 38 0.2
Thomas Kern 100 21 0.2
Akiko Uehara 49 4 0.1
Dahai Shao 30 4 0.1
Eduardo Simantob 37 5 0.1
Eduardo Simantob,Carlo Pisani 20 2 0.1
Marguerite Meyer 33 2 0.1
Zeno Zoccatelli 16 1 0.1
Carlo Pisani,Eduardo Simantob 92 1 0.0
Deganit Perez 85 3 0.0
Eduardo Simantob,swissinfo 19 0 0.0
Olivier Pauchard,Ester Unterfinger 24 1 0.0

Plots

## distribution of comments per language
## Boxplot + beeswarm of comment counts, one row per language.
articles_summaries %>% 
  # only articles that actually exist in the given language
  filter(article_exists) %>% 
  ggplot(aes(x = language, y = nr_comments, color = language)) +
  geom_boxplot(alpha = 0.1) +
  geom_quasirandom(alpha = 0.7) + 
  # guides(color = FALSE) is deprecated since ggplot2 3.3.4
  guides(color = "none") + 
  labs(x = "Language", y = "Number of comments") +
  coord_flip()

## distribution of comments per language with ridge plot
articles_summaries %>% 
  # only articles that actually exist in the given language
  filter(article_exists) %>% 
  ggplot(aes(x = nr_comments, y = language, group = language, fill = language)) +
  geom_density_ridges2(rel_min_height = 0.001, scale = 0.95, alpha = 0.6) +
  # guides(fill = FALSE) is deprecated since ggplot2 3.3.4
  guides(fill = "none") +
  # BUG FIX: original passed `y` twice to labs(); the second label belongs
  # on the x axis (nr_comments is mapped to x in this plot)
  labs(x = "Number of comments", y = "Language") 

## distribution of comments per language and category
## Top-5 categories (rest lumped into "Other"), faceted by language.
articles_summaries %>% 
  # only articles that actually exist in the given language
  filter(article_exists) %>% 
  # keep the 5 most frequent categories, collapse the rest into "Other"
  mutate(category = fct_lump(f = category, n = 5)) %>%
  ggplot(aes(x = category, y = nr_comments, color = category)) +
  geom_boxplot(alpha = 0.1) +
  geom_quasirandom(alpha = 0.7) +
  # guides(color = FALSE) is deprecated since ggplot2 3.3.4
  guides(color = "none") + 
  facet_wrap(~ language, ncol = 3, scales = "free") +
  labs(x = "Category", y = "Number of comments") +
  coord_flip()