Libraries

library(knitr)       # kable() tables
library(tidyverse)   # data wrangling, purrr::map() and ggplot2
library(lubridate)   # parsing publication timestamps
library(stringr)     # string helpers
library(jsonlite)    # reading the article dump from JSON
library(ggbeeswarm)  # geom_quasirandom() for beeswarm-style points
library(ggridges)    # geom_density_ridges2() for ridge plots
library(quanteda)    # corpus handling, stopwords and word clouds

Data

## extract the article archive and read it into a nested list
unzip(zipfile = "../articles.zip")
articles <- read_json("articles.json")
languages <- c("German", "Spanish", "French", "Italian", "Japanese",
               "Portuguese", "Chinese", "English", "Russian", "Arabic")

## aggregating article features
articles_summaries <- articles %>% 
  map(.f = function(article) {
    
    # for each language, check whether the article exists in that language
    # and count its comments
    comments <- map(.x = languages, .f = function(lang) {
      nr_comments <- length(article$content[[lang]]$comments)
      article_exists <- !is.null(article$content[[lang]])
      data.frame(language = lang, nr_comments = nr_comments,
                 article_exists = article_exists)
    }) %>% 
      bind_rows()
    
    data.frame(title = article$TITLE,
               link = article$link,
               published = article$PUBLISHED,
               created = article$CREATED,
               subject = article$SUBJECT,
               authortag = article$AUTHORSTAG,
               author = article$AUTHORS,
               # we assume the first listed category is the most important one
               category = article$category[[1]][[1]]) %>% 
      cbind(., comments)
  }) %>% 
  bind_rows() %>% 
  as_tibble() %>% 
  mutate(published = parse_date_time(published, orders = "%d.%m.%Y %H:%M", tz = "CET"),
         created = parse_date_time(created, orders = "%d.%m.%Y %H:%M", tz = "CET"))
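
## optional sanity check (assumes the articles_summaries tibble built above):
## confirm column types and that the timestamp parsing produced no NAs
glimpse(articles_summaries)
articles_summaries %>% 
  summarise(na_published = sum(is.na(published)),
            na_created = sum(is.na(created)))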

Descriptive statistics

## computing some summary statistics
articles_summaries %>% 
  group_by(language) %>% 
  summarise(`Number of articles` = sum(article_exists),
            `Total comments` = sum(nr_comments),
            `Avg number of comments per article` = round(`Total comments`/`Number of articles`, 1)) %>% 
  arrange(-`Avg number of comments per article`) %>% 
  kable()
language Number of articles Total comments Avg number of comments per article
English 5691 12915 2.3
German 1488 1129 0.8
French 1462 776 0.5
Italian 1346 563 0.4
Spanish 2252 833 0.4
Arabic 2237 404 0.2
Portuguese 1766 319 0.2
Russian 1719 264 0.2
Chinese 2066 118 0.1
Japanese 2161 101 0.0
articles_summaries %>% 
  group_by(category) %>% 
  summarise(`Number of articles` = sum(article_exists),
            `Total comments` = sum(nr_comments),
            `Avg number of comments per article` = round(`Total comments`/`Number of articles`, 1)) %>% 
  arrange(-`Avg number of comments per article`) %>% 
  kable()
category Number of articles Total comments Avg number of comments per article
Society 2033 2149 1.1
Education 649 657 1.0
Lifestyle 483 493 1.0
Politics 4104 4105 1.0
Direct democracy 1129 1045 0.9
Law and order 956 819 0.9
Weather 292 249 0.9
Conflict 242 188 0.8
Work 565 470 0.8
Business 3320 2443 0.7
Disaster 320 226 0.7
Health 730 478 0.7
NA 1691 1212 0.7
Religion 228 162 0.7
Environment 872 515 0.6
Foreign Affairs 209 126 0.6
Human interest 453 281 0.6
Sci & Tech 1477 783 0.5
Sport 440 240 0.5
Culture 1995 781 0.4
articles_summaries %>% 
  filter(authortag != "") %>% 
  group_by(authortag) %>% 
  summarise(`Number of articles` = sum(article_exists),
            `Total comments` = sum(nr_comments),
            `Avg number of comments per article` = round(`Total comments`/`Number of articles`, 1)) %>% 
  filter(`Number of articles` >= 15) %>% 
  arrange(-`Avg number of comments per article`) %>% 
  kable()
authortag Number of articles Total comments Avg number of comments per article
Julie Hunt,Kai Reusser 15 40 2.7
Dale Bechtel 35 66 1.9
Urs Geiser 227 419 1.8
Sibilla Bondolfi 327 568 1.7
Susan Misicka,Kai Reusser 27 47 1.7
Jessica Davis Plüss 128 202 1.6
Isobel Leybold-Johnson 172 242 1.4
Kai Reusser,Sonia Fenazzi 19 26 1.4
Patricia Islas 27 37 1.4
Philipp Meier 27 38 1.4
Alexandra Kohler 47 58 1.2
Anand Chandrasekhar 209 247 1.2
Armando Mombelli 92 106 1.2
Marie Vuilleumier 79 96 1.2
Sibilla Bondolfi,Ester Unterfinger 39 48 1.2
Sonia Fenazzi 44 54 1.2
Susan Misicka 264 305 1.2
Thomas Stephens 322 400 1.2
Balz Rigendinger 28 30 1.1
Domhnall OSullivan 171 196 1.1
Dominique Soguel-dit-Picard 85 90 1.1
Katy Romy 149 171 1.1
Alexandra Kohler,Céline Stegmüller 16 16 1.0
Jo Fahy 53 53 1.0
Luigi Jorio 224 220 1.0
Olivier Pauchard 136 142 1.0
Peter Siegenthaler 117 115 1.0
Renat Kuenzi 126 123 1.0
Andrea Tognina 122 105 0.9
Geraldine Wong Sak Hoi 57 54 0.9
Marcela Aguila Rubín 23 21 0.9
Jessica Davis Plüss,Dominique Soguel-dit-Picard 23 18 0.8
Jessica Davis Plüss,Kai Reusser 15 12 0.8
Jie Guo Zehnder 32 24 0.8
Julia Crawford 148 121 0.8
Marc-André Miserez 108 87 0.8
Simon Bradley 363 275 0.8
Carlo Pisani 22 15 0.7
Christian Raaflaub 147 108 0.7
Kathrin Ammann 53 35 0.7
Larissa M. Bieler 24 16 0.7
Celia Luterbacher 60 39 0.6
Frédéric Burnand 103 57 0.6
Samuel Jaberg 154 98 0.6
Julie Hunt 277 132 0.5
Matthew Allen 435 210 0.5
Michele Andina 97 48 0.5
Helen James 213 90 0.4
Olivier Pauchard,Thomas Kern 17 6 0.4
Céline Stegmüller 178 55 0.3
Thomas Stephens,Kai Reusser 25 8 0.3
Alexander Thoele 16 4 0.2
Ester Unterfinger 226 38 0.2
Thomas Kern 100 21 0.2
Akiko Uehara 49 4 0.1
Dahai Shao 30 4 0.1
Eduardo Simantob 37 5 0.1
Eduardo Simantob,Carlo Pisani 20 2 0.1
Marguerite Meyer 33 2 0.1
Zeno Zoccatelli 16 1 0.1
Carlo Pisani,Eduardo Simantob 92 1 0.0
Deganit Perez 85 3 0.0
Eduardo Simantob,swissinfo 19 0 0.0
Olivier Pauchard,Ester Unterfinger 24 1 0.0

Plots

## distribution of comments per language
articles_summaries %>% 
  filter(article_exists == TRUE) %>% 
  ggplot(aes(x = language, y = nr_comments, color = language)) +
  geom_boxplot(alpha = 0.1) +
  geom_quasirandom(alpha = 0.7) + 
  guides(color = FALSE) + 
  labs(x = "Language", y = "Number of comments") +
  coord_flip()

## distribution of comments per language with ridge plot
articles_summaries %>% 
  filter(article_exists == TRUE) %>% 
  ggplot(aes(x = nr_comments, y = language, group = language, fill = language)) +
  geom_density_ridges2(rel_min_height = 0.001, scale = 0.95, alpha = 0.6) +
  guides(fill = FALSE) +
  labs(y = "Language", y = "Density of number of comments") 

## distribution of comments per language and category
articles_summaries %>% 
  filter(article_exists == TRUE) %>% 
  mutate(category = fct_lump(f = category, n = 5)) %>%
  ggplot(aes(x = category, y = nr_comments, color = category)) +
  geom_boxplot(alpha = 0.1) +
  geom_quasirandom(alpha = 0.7) +
  guides(color = FALSE) + 
  facet_wrap(~ language, ncol = 3, scales = "free") +
  labs(x = "Category", y = "Number of comments") +
  coord_flip()

## time-series plot per category
articles_summaries %>% 
  mutate(category = fct_lump(f = category, n = 4)) %>%
  mutate(date = date(published)) %>% 
  group_by(date, category) %>%
  summarise(sum_comments = sum(nr_comments)) %>%
  ggplot(aes(x = date, y = sum_comments)) +
  geom_col() +
  geom_smooth(alpha = 0.5) +
  scale_y_sqrt(breaks = c(0, 2, 10, 25, 50, 100, 150, 200)) +
  facet_wrap(~ category) +
  labs(x = "Date", y = "Total number of comments")

## wordcloud plot per language; Chinese and Japanese are skipped because the
## stopword lookup below does not cover them
wordcloud_languages <- c("German", "English", "Italian", "French",
                         "Spanish", "Portuguese", "Russian",
                         "Arabic")
for (lang in wordcloud_languages) {
  print(lang)
  
  map(articles, function(article) {
    article$content[[lang]]$comments
  }) %>% 
  unlist() %>% 
  corpus() %>% 
  ## remove per-language stopwords plus the German conjunction "dass"
  dfm(remove = c(stopwords(case_when(lang == "English" ~ "en", 
                                   lang == "German" ~ "de",
                                   lang == "Italian" ~ "it",
                                   lang == "French" ~ "fr",
                                   lang == "Spanish" ~ "es",
                                   lang == "Portuguese" ~ "por",
                                   lang == "Arabic" ~ "ar", 
                                   lang == "Russian" ~ "ru")), "dass"),
      remove_punct = TRUE, remove_numbers = TRUE) %>% 
  textplot_wordcloud(min_count = 6, random_order = FALSE,
                     rotation = 0.25, max_size = 8,
                     color = RColorBrewer::brewer.pal(4, "Dark2"))  
}
## [1] "German"

## [1] "English"

## [1] "Italian"

## [1] "French"

## [1] "Spanish"

## [1] "Portuguese"

## [1] "Russian"

## [1] "Arabic"

Look at outliers

articles_summaries %>% 
  filter(nr_comments >= 40) %>% 
  select(title, nr_comments, language) %>% 
  rename(Title = title,
         Comments = nr_comments,
         Language = language) %>% 
  arrange(-Comments)
## # A tibble: 9 x 3
##   Title                                                        Comments Language
##   <chr>                                                           <int> <chr>   
## 1 Minister: ‘UN aid agency is part of the problem in the Midd…       77 English 
## 2 Voters approve ‘burka ban’ in St Gallen                            60 English 
## 3 The rebirth of Ma Anand Sheela: from Rajneeshee queen to ca…       50 English 
## 4 If investors don’t overhaul banker pay, populism will              45 Spanish 
## 5 How Santander’s ditching of Andrea Orcel stunned finance in…       45 Spanish 
## 6 How to fit in at a Swiss workplace                                 43 English 
## 7 Boris Johnson: entertaining raconteur or dangerous liar?           43 English 
## 8 Dual national stripped of Swiss citizenship for first time         41 English 
## 9 Racism in Switzerland:  An expat’s perspective                     41 English

Build regression model

articles_summaries_modeldf <- articles_summaries %>% 
  filter(article_exists == TRUE) %>% 
  mutate(language = relevel(factor(language), ref = "German"),
         category = relevel(factor(category), ref = "Culture"))
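
## quick check (assumes articles_summaries_modeldf from above): the first
## factor level of each variable is the reference category in the model below
levels(articles_summaries_modeldf$language)[1]
levels(articles_summaries_modeldf$category)[1]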

## this model is somewhat misspecified: observations belonging to the same
## article are not independent, which should be accounted for with e.g.
## random effects (see the commented-out glmer.nb() sketch below).
## There was no time left to fit such a model.
# library(lme4)
# fit1 <- glmer.nb(nr_comments ~ language + category + (1|title),
#                  verbose = 2, data = articles_summaries_modeldf)
fit1 <- glm(nr_comments ~ language + category, family = "quasipoisson",
            data = articles_summaries_modeldf)
coefs <- data.frame(summary(fit1)$coefficients)
data.frame(variable = rownames(coefs),
           RR = exp(coefs$Estimate),
           lower = exp(coefs$Estimate - qnorm(0.975)*coefs$Std..Error),
           upper = exp(coefs$Estimate + qnorm(0.975)*coefs$Std..Error)) %>%
  filter(variable != "(Intercept)") %>% 
  ggplot(aes(x = variable, y = RR)) +
  geom_pointrange(aes(ymin = lower, ymax = upper)) +
  geom_hline(yintercept = 1, lty = 2, alpha = 0.5) + 
  labs(x = "Variable", y = "Relative increase in number of comments",
       title = "(slightly missspecified) model with reference: German, Culture") + 
  coord_flip()
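
## the quasipoisson family allows for overdispersion; as a rough check
## (assumes the fitted model fit1 from above), the estimated dispersion
## parameter can be inspected: values well above 1 indicate overdispersion
## relative to a plain Poisson model
summary(fit1)$dispersion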

Data analysis - Word vector embeddings