Libraries

library(knitr)       # kable() tables
library(tidyverse)   # data wrangling, purrr::map() and ggplot2
library(lubridate)   # parsing publication timestamps
library(stringr)     # string helpers
library(jsonlite)    # reading the article dump from JSON
library(ggbeeswarm)  # geom_quasirandom() for beeswarm-style points
library(ggridges)    # geom_density_ridges2() for ridge plots
library(quanteda)    # corpus handling, stopwords and word clouds

Data

## extract the article archive and read it into a nested list
unzip(zipfile = "../articles.zip")
articles <- read_json("articles.json")
languages <- c("German", "Spanish", "French", "Italian", "Japanese",
               "Portuguese", "Chinese", "English", "Russian", "Arabic")

## aggregating article features
articles_summaries <- articles %>% 
  map(.f = function(article) {
    
    # for each language, check whether the article exists in that language
    # and count its comments
    comments <- map(.x = languages, .f = function(lang) {
      nr_comments <- length(article$content[[lang]]$comments)
      article_exists <- !is.null(article$content[[lang]])
      data.frame(language = lang, nr_comments = nr_comments,
                 article_exists = article_exists)
    }) %>% 
      bind_rows()
    
    data.frame(title = article$TITLE,
               link = article$link,
               published = article$PUBLISHED,
               created = article$CREATED,
               subject = article$SUBJECT,
               authortag = article$AUTHORSTAG,
               author = article$AUTHORS,
               # we assume the first listed category is the most important one
               category = article$category[[1]][[1]]) %>% 
      cbind(., comments)
  }) %>% 
  bind_rows() %>% 
  as_tibble() %>% 
  mutate(published = parse_date_time(published, orders = "%d.%m.%Y %H:%M", tz = "CET"),
         created = parse_date_time(created, orders = "%d.%m.%Y %H:%M", tz = "CET"))
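
## optional sanity check (assumes the articles_summaries tibble built above):
## confirm column types and that the timestamp parsing produced no NAs
glimpse(articles_summaries)
articles_summaries %>% 
  summarise(na_published = sum(is.na(published)),
            na_created = sum(is.na(created)))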

Descriptive statistics

## computing some summary statistics
articles_summaries %>% 
  group_by(language) %>% 
  summarise(`Number of articles` = sum(article_exists),
            `Total comments` = sum(nr_comments),
            `Avg number of comments per article` = round(`Total comments`/`Number of articles`, 1)) %>% 
  arrange(-`Avg number of comments per article`) %>% 
  kable()
language Number of articles Total comments Avg number of comments per article
English 5691 12915 2.3
German 1488 1129 0.8
French 1462 776 0.5
Italian 1346 563 0.4
Spanish 2252 833 0.4
Arabic 2237 404 0.2
Portuguese 1766 319 0.2
Russian 1719 264 0.2
Chinese 2066 118 0.1
Japanese 2161 101 0.0
articles_summaries %>% 
  group_by(category) %>% 
  summarise(`Number of articles` = sum(article_exists),
            `Total comments` = sum(nr_comments),
            `Avg number of comments per article` = round(`Total comments`/`Number of articles`, 1)) %>% 
  arrange(-`Avg number of comments per article`) %>% 
  kable()
category Number of articles Total comments Avg number of comments per article
Society 2033 2149 1.1
Education 649 657 1.0
Lifestyle 483 493 1.0
Politics 4104 4105 1.0
Direct democracy 1129 1045 0.9
Law and order 956 819 0.9
Weather 292 249 0.9
Conflict 242 188 0.8
Work 565 470 0.8
Business 3320 2443 0.7
Disaster 320 226 0.7
Health 730 478 0.7
NA 1691 1212 0.7
Religion 228 162 0.7
Environment 872 515 0.6
Foreign Affairs 209 126 0.6
Human interest 453 281 0.6
Sci & Tech 1477 783 0.5
Sport 440 240 0.5
Culture 1995 781 0.4
articles_summaries %>% 
  filter(authortag != "") %>% 
  group_by(authortag) %>% 
  summarise(`Number of articles` = sum(article_exists),
            `Total comments` = sum(nr_comments),
            `Avg number of comments per article` = round(`Total comments`/`Number of articles`, 1)) %>% 
  filter(`Number of articles` >= 15) %>% 
  arrange(-`Avg number of comments per article`) %>% 
  kable()
authortag Number of articles Total comments Avg number of comments per article
Julie Hunt,Kai Reusser 15 40 2.7
Dale Bechtel 35 66 1.9
Urs Geiser 227 419 1.8
Sibilla Bondolfi 327 568 1.7
Susan Misicka,Kai Reusser 27 47 1.7
Jessica Davis Plüss 128 202 1.6
Isobel Leybold-Johnson 172 242 1.4
Kai Reusser,Sonia Fenazzi 19 26 1.4
Patricia Islas 27 37 1.4
Philipp Meier 27 38 1.4
Alexandra Kohler 47 58 1.2
Anand Chandrasekhar 209 247 1.2
Armando Mombelli 92 106 1.2
Marie Vuilleumier 79 96 1.2
Sibilla Bondolfi,Ester Unterfinger 39 48 1.2
Sonia Fenazzi 44 54 1.2
Susan Misicka 264 305 1.2
Thomas Stephens 322 400 1.2
Balz Rigendinger 28 30 1.1
Domhnall OSullivan 171 196 1.1
Dominique Soguel-dit-Picard 85 90 1.1
Katy Romy 149 171 1.1
Alexandra Kohler,Céline Stegmüller 16 16 1.0
Jo Fahy 53 53 1.0
Luigi Jorio 224 220 1.0
Olivier Pauchard 136 142 1.0
Peter Siegenthaler 117 115 1.0
Renat Kuenzi 126 123 1.0
Andrea Tognina 122 105 0.9
Geraldine Wong Sak Hoi 57 54 0.9
Marcela Aguila Rubín 23 21 0.9
Jessica Davis Plüss,Dominique Soguel-dit-Picard 23 18 0.8
Jessica Davis Plüss,Kai Reusser 15 12 0.8
Jie Guo Zehnder 32 24 0.8
Julia Crawford 148 121 0.8
Marc-André Miserez 108 87 0.8
Simon Bradley 363 275 0.8
Carlo Pisani 22 15 0.7
Christian Raaflaub 147 108 0.7
Kathrin Ammann 53 35 0.7
Larissa M. Bieler 24 16 0.7
Celia Luterbacher 60 39 0.6
Frédéric Burnand 103 57 0.6
Samuel Jaberg 154 98 0.6
Julie Hunt 277 132 0.5
Matthew Allen 435 210 0.5
Michele Andina 97 48 0.5
Helen James 213 90 0.4
Olivier Pauchard,Thomas Kern 17 6 0.4
Céline Stegmüller 178 55 0.3
Thomas Stephens,Kai Reusser 25 8 0.3
Alexander Thoele 16 4 0.2
Ester Unterfinger 226 38 0.2
Thomas Kern 100 21 0.2
Akiko Uehara 49 4 0.1
Dahai Shao 30 4 0.1
Eduardo Simantob 37 5 0.1
Eduardo Simantob,Carlo Pisani 20 2 0.1
Marguerite Meyer 33 2 0.1
Zeno Zoccatelli 16 1 0.1
Carlo Pisani,Eduardo Simantob 92 1 0.0
Deganit Perez 85 3 0.0
Eduardo Simantob,swissinfo 19 0 0.0
Olivier Pauchard,Ester Unterfinger 24 1 0.0

Plots

## distribution of comments per language
articles_summaries %>% 
  filter(article_exists == TRUE) %>% 
  ggplot(aes(x = language, y = nr_comments, color = language)) +
  geom_boxplot(alpha = 0.1) +
  geom_quasirandom(alpha = 0.7) + 
  guides(color = FALSE) + 
  labs(x = "Language", y = "Number of comments") +
  coord_flip()

## distribution of comments per language with ridge plot
articles_summaries %>% 
  filter(article_exists == TRUE) %>% 
  ggplot(aes(x = nr_comments, y = language, group = language, fill = language)) +
  geom_density_ridges2(rel_min_height = 0.001, scale = 0.95, alpha = 0.6) +
  guides(fill = FALSE) +
  labs(y = "Language", y = "Density of number of comments") 

## distribution of comments per language and category
articles_summaries %>% 
  filter(article_exists == TRUE) %>% 
  mutate(category = fct_lump(f = category, n = 5)) %>%
  ggplot(aes(x = category, y = nr_comments, color = category)) +
  geom_boxplot(alpha = 0.1) +
  geom_quasirandom(alpha = 0.7) +
  guides(color = FALSE) + 
  facet_wrap(~ language, ncol = 3, scales = "free") +
  labs(x = "Category", y = "Number of comments") +
  coord_flip()

## time-series plot per category
articles_summaries %>% 
  mutate(category = fct_lump(f = category, n = 4)) %>%
  mutate(date = date(published)) %>% 
  group_by(date, category) %>%
  summarise(sum_comments = sum(nr_comments)) %>%
  ggplot(aes(x = date, y = sum_comments)) +
  geom_col() +
  geom_smooth(alpha = 0.5) +
  scale_y_sqrt(breaks = c(0, 2, 10, 25, 50, 100, 150, 200)) +
  facet_wrap(~ category) +
  labs(x = "Date", y = "Total number of comments")

## wordcloud plot per language; Chinese and Japanese are skipped because the
## stopword lookup below does not cover them
wordcloud_languages <- c("German", "English", "Italian", "French",
                         "Spanish", "Portuguese", "Russian",
                         "Arabic")
for (lang in wordcloud_languages) {
  print(lang)
  
  map(articles, function(article) {
    article$content[[lang]]$comments
  }) %>% 
  unlist() %>% 
  corpus() %>% 
  ## remove per-language stopwords plus the German conjunction "dass"
  dfm(remove = c(stopwords(case_when(lang == "English" ~ "en", 
                                   lang == "German" ~ "de",
                                   lang == "Italian" ~ "it",
                                   lang == "French" ~ "fr",
                                   lang == "Spanish" ~ "es",
                                   lang == "Portuguese" ~ "por",
                                   lang == "Arabic" ~ "ar", 
                                   lang == "Russian" ~ "ru")), "dass"),
      remove_punct = TRUE, remove_numbers = TRUE) %>% 
  textplot_wordcloud(min_count = 6, random_order = FALSE,
                     rotation = 0.25, max_size = 8,
                     color = RColorBrewer::brewer.pal(4, "Dark2"))  
}
## [1] "German"

## [1] "English"

## [1] "Italian"

## [1] "French"

## [1] "Spanish"

## [1] "Portuguese"

## [1] "Russian"

## [1] "Arabic"

Look at outliers

articles_summaries %>% 
  filter(nr_comments >= 40) %>% 
  select(title, nr_comments, language) %>% 
  rename(Title = title,
         Comments = nr_comments,
         Language = language) %>% 
  arrange(-Comments)
## # A tibble: 9 x 3
##   Title                                                        Comments Language
##   <chr>                                                           <int> <chr>   
## 1 Minister: ‘UN aid agency is part of the problem in the Midd…       77 English 
## 2 Voters approve ‘burka ban’ in St Gallen                            60 English 
## 3 The rebirth of Ma Anand Sheela: from Rajneeshee queen to ca…       50 English 
## 4 If investors don’t overhaul banker pay, populism will              45 Spanish 
## 5 How Santander’s ditching of Andrea Orcel stunned finance in…       45 Spanish 
## 6 How to fit in at a Swiss workplace                                 43 English 
## 7 Boris Johnson: entertaining raconteur or dangerous liar?           43 English 
## 8 Dual national stripped of Swiss citizenship for first time         41 English 
## 9 Racism in Switzerland:  An expat’s perspective                     41 English

Build regression model

articles_summaries_modeldf <- articles_summaries %>% 
  filter(article_exists == TRUE) %>% 
  mutate(language = relevel(factor(language), ref = "German"),
         category = relevel(factor(category), ref = "Culture"))
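
## quick check (assumes articles_summaries_modeldf from above): the first
## factor level of each variable is the reference category in the model below
levels(articles_summaries_modeldf$language)[1]
levels(articles_summaries_modeldf$category)[1]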

## this model is somewhat misspecified: observations belonging to the same
## article are not independent, which should be accounted for with e.g.
## random effects (see the commented-out glmer.nb() sketch below).
## There was no time left to fit such a model.
# library(lme4)
# fit1 <- glmer.nb(nr_comments ~ language + category + (1|title),
#                  verbose = 2, data = articles_summaries_modeldf)
fit1 <- glm(nr_comments ~ language + category, family = "quasipoisson",
            data = articles_summaries_modeldf)
coefs <- data.frame(summary(fit1)$coefficients)
data.frame(variable = rownames(coefs),
           RR = exp(coefs$Estimate),
           lower = exp(coefs$Estimate - qnorm(0.975)*coefs$Std..Error),
           upper = exp(coefs$Estimate + qnorm(0.975)*coefs$Std..Error)) %>%
  filter(variable != "(Intercept)") %>% 
  ggplot(aes(x = variable, y = RR)) +
  geom_pointrange(aes(ymin = lower, ymax = upper)) +
  geom_hline(yintercept = 1, lty = 2, alpha = 0.5) + 
  labs(x = "Variable", y = "Relative increase in number of comments",
       title = "(slightly missspecified) model with reference: German, Culture") + 
  coord_flip()
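
## the quasipoisson family allows for overdispersion; as a rough check
## (assumes the fitted model fit1 from above), the estimated dispersion
## parameter can be inspected: values well above 1 indicate overdispersion
## relative to a plain Poisson model
summary(fit1)$dispersion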

Data analysis - Word vector embeddings