Systematic Literature Review


Olivier Caron

Paris Dauphine - PSL

Christophe Benavent

Paris Dauphine - PSL


December 13, 2023

1 Introduction



# Read the semicolon-delimited export, keep the 16 columns used downstream
# (selected by position) and retain only the first row of each entry_number.
df <- read_delim(
  "nlp_full_data_final_18-08-2023.csv",
  delim = ";",
  escape_double = FALSE,
  trim_ws = TRUE
) %>%
  select(1, 5, 7, 8, 14, 15, 16, 21, 23, 32, 33, 39, 40, 51, 52, 53) %>%
  group_by(entry_number) %>%
  filter(row_number() == 1) %>%
  rename(id = 1, title = 8, review = 9, text = 10, keywords = 13)

# review2 is a copy of review in which every source whose name matches one of
# the patterns below is recoded to "out". The patterns are applied one after
# the other, each pass working on the result of the previous one.
df$review2 <- Reduce(
  function(labels, pattern) ifelse(str_detect(labels, pattern), "out", labels),
  c("Proceedings", "Conference", "Transactions"),
  df$review
)

# for the networks
# Same export and same column selection as for df, but every row is kept
# (no de-duplication per entry).
list_articles <- read_csv2("nlp_full_data_final_18-08-2023.csv") %>%
  select(1, 5, 7, 8, 14, 15, 16, 21, 23, 32, 33, 39, 40, 51, 52, 53) %>%
  rename(id = 1, title = 8, review = 9, text = 10, keywords = 13)

# Recode to "out" every source whose name matches one of the patterns, applying
# the patterns one after the other, then drop the rows flagged "out".
list_articles$review2 <- Reduce(
  function(labels, pattern) ifelse(str_detect(labels, pattern), "out", labels),
  c("Proceedings", "Conference", "Transactions"),
  list_articles$review
)
list_articles <- filter(list_articles, review2 != "out")
# Python dependencies for the co-authorship networks.
import community as community_louvain
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
# NOTE(review): the module name was missing here (`import as px`);
# plotly.express is the module conventionally aliased as `px` - confirm.
import plotly.express as px
import re

from ipysigma import Sigma, SigmaGrid
# NOTE(review): the module name was missing here (`from import Network`);
# pyvis.network is the usual source of `Network` - confirm.
from pyvis.network import Network
from itertools import combinations
from datetime import datetime
from functools import partial

# Fetch the data frame prepared on the R side through the `r` object.
list_articles = r.list_articles

2 Description

# Keep only the entries whose source was not recoded to "out" in review2.
df<- df%>%filter(review2 !="out")
# NOTE(review): the next statement is truncated in this copy: the text between
# `t0 <` and `$review2))%>%` is missing and the pipe has no right-hand side.
# Restore it from the original notebook before running.
t0 <$review2))%>%

# Graph 1: horizontal bar chart of t0, one bar per Var1 ordered by Freq.
g01 <- ggplot(t0, aes(x=reorder(Var1, Freq), y=Freq)) +
  geom_bar(stat="identity", fill="steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(title="Number of Articles per Journal", y="Proportion", x="") +
  # NOTE(review): the two arguments below look like the body of a theme(...)
  # call whose opening and closing lines are missing - confirm.
    axis.title.y = element_blank(),
    axis.text.y = element_text(size = 10)


# Graph 2: Number of publications per year
# NOTE(review): t1 is not defined anywhere in the visible code; presumably it is
# a frequency table of publication years built in a line that was lost.
g02 <- ggplot(t1, aes(x=Var1, y=Freq, group=1)) +
  geom_line(size=1.1, color="steelblue") +
  geom_point(size=2, color="steelblue") +
  geom_smooth(color="#7D7C7C", linewidth=0.5)+
  theme_minimal() +
  labs(title="Number of Publications per Year", y="", x="Year") +
  # NOTE(review): as for g01, the theme( ... ) wrapper around the three
  # arguments below appears to be missing - confirm.
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    axis.title.x = element_blank(),
    plot.title = element_text(hjust = 0.5)

# Show the two graphs side by side (A = journals, B = years) ...
plot_grid(g01, g02, labels = c('A', 'B'), label_size = 8, ncol=2,  rel_widths =  c(1,1))

# ... then build the same grid again, this time stored so it can be saved.
plotgrid <- plot_grid(g01, g02, labels = c('A', 'B'), label_size = 8, ncol=2,  rel_widths =  c(1,1))

# Write the combined figure to disk (40 x 18 cm).
ggsave(filename="./images/quant.jpeg", plotgrid, width = 40, height = 18, units = "cm")

3 Keywords

# Long table of keywords: split the "|"-separated keywords field into up to 16
# columns (A1..A16), then stack them so that each row is one (id, keyword) pair.
user2<- df %>% 
  select(id, keywords)%>% 
  separate(keywords, sep="\\|", into=c("A1","A2","A3","A4","A5","A6","A7","A8","A9","A10", "A11", "A12", "A13", "A14", "A15", "A16"))%>%
  pivot_longer(-id, names_to="Rang", values_to = "keywords") %>%
# NOTE(review): the pipe above has no right-hand side in this copy; at least one
# step of the chain was lost - confirm against the original notebook.

# Alternative construction of the same long table: lower-case the keywords,
# split them on "|" with cSplit() and drop the empty cells while stacking.
test_user2 <- df %>% 
  select(id, keywords) %>%
  mutate(keywords = tolower(keywords)) %>%
  cSplit('keywords', sep = "|", direction = "wide") %>%
  pivot_longer(-id, names_to = "Rang", values_to = "keywords", values_drop_na = TRUE)

# Remove leading and trailing whitespace left over from the split.
user2$keywords= str_trim(user2$keywords,side ="both")

# Harmonise keyword variants: each name on the left is an exact keyword value
# found in the data, the value on the right is the form it is replaced with.
# Keywords that match none of the names (including NA) are left untouched.
user2$keywords <- local({
  recodes <- c(
    "artificial intelligence (ai)" = "ai",
    "automated analysis of text" = "automated text analysis",
    "automated textual analysis" = "automated text analysis",
    "natural language processinf" = "nlp",
    "natural language processing (nlp)" = "nlp",
    "natural language processing (nlp)-based approach" = "nlp",
    "nlp tools" = "nlp",
    "natural language processing" = "nlp",
    "online review" = "online reviews",
    "online shopping review" = "online reviews",
    "review" = "online reviews",
    "reviews" = "online reviews",
    "topic modelling" = "topic modeling",
    "user generated content" = "user-generated content"
  )
  kw <- user2$keywords
  hit <- match(kw, names(recodes))
  found <- !is.na(hit)
  kw[found] <- unname(recodes[hit[found]])
  kw
})

# count the keywords
# NOTE(review): both pipelines below end on a dangling `%>%`; the aggregation
# steps (presumably the ones creating the `m` column used by the plot) were lost
# in this copy - confirm against the original notebook.
foo<- user2 %>%
  mutate(n=1) %>%

foo1<- foo %>%

# Word cloud of the keywords, text size driven by `m`.
ggplot(foo1, aes(label = keywords, size = m)) +
  geom_text_wordcloud() +
  scale_size_area(max_size = 10) +
# NOTE(review): the trailing `+` above has nothing after it; a final layer or
# theme call appears to be missing.

# Save the last plot drawn (27 x 18 cm).
ggsave(filename="./images/keywords01.jpeg", plot=last_plot(), width = 27, height = 18, units = "cm")

# tsne projection

# Wide table with one column per keyword holding its count `n`.
# NOTE(review): summarize() is called with no visible group_by(); the grouping
# step seems to have been lost - confirm.
foo2<-user2 %>%
  summarize(n=n()) %>% 
  pivot_wider(id,names_from = "keywords", values_from ="n" )

# NOTE(review): `replace(,0)` is missing its condition (presumably a test for
# missing values) and the pipe has no right-hand side; `foo3`, `tsne_out2` and
# `keywords` used below are not defined in the visible code.
foo2<- foo2 %>% 
  replace(,0)  %>%

tsne_out <- Rtsne(foo3,perplexity = 10, dim=2,  check_duplicates = FALSE) # Run TSNE
# Attach the keyword labels to the coordinates, then the counts from foo1.
tsne_out3<-cbind(tsne_out2, keywords) 
tsne_out3<- merge(tsne_out3,foo1)
# Scatter plot of the projection: point size follows m, and only keywords with
# m > 1 are labelled.
tsneplot <- tsne_out3%>% 
  filter(m>0) %>%
  ggplot(aes(x=V1, y=V2, label=keywords))+
  geom_point(aes(size=m), alpha=.5)+
  geom_text_repel(aes(label=ifelse(m>1,keywords,""),size=log(m)),  max.overlaps =50)+
  scale_size_continuous(range = c(2, 10))

# Save the projection (27 x 18 cm).
ggsave(filename="./images/keywords.jpeg", tsneplot, width = 27, height = 18, units = "cm")

3.1 With a breakdown of the keywords

The strategy is to first collapse the spelling variants of the most frequent keywords into a single one-word form (a unigram), and then to split the remaining, non-recoded keywords into the individual terms that compose them.

# Long table of keywords (one row per id and keyword) in which the most frequent
# multi-word keywords are collapsed into a single token.
#
# NOTE(review): this chain is incomplete in this copy. The filter() condition is
# truncated, the `mutate(` line that should open the list of replacements is
# missing, and so is the closing parenthesis after the last replacement.
#
# NOTE(review): points to check once the chain is restored. str_replace() takes
# a regular expression by default and replaces only the first match, and the
# replacements run from top to bottom, so:
#   * patterns containing "(ai)", "(nlp)", "(ugc)", "(ewom)" or "(lda)" treat
#     the parentheses as a regex group, not as literal characters;
#   * a short pattern placed before a longer one that contains it prevents the
#     longer one from ever matching, e.g. "artificial intelligence" before
#     "artificial intelligence (ai)", "natural language processing" before its
#     "(nlp)" variants, "latent dirichlet allocation" before its variants,
#     "machine learning" before "machine learning and linguistic analysis";
#   * "online review" -> "reviews" turns "online reviews" into "reviewss", and
#     "artificial neural network" -> "ai" turns "artificial neural networks"
#     into "ais";
#   * some lines are exact duplicates, and "liwc" -> "liwc" changes nothing;
#   * the last two lines remove only the first "(" and the first ")".
key1<- df %>% 
  select(id, keywords)%>% 
  separate(keywords, sep="\\|", into=c("A1","A2","A3","A4","A5","A6","A7","A8","A9","A10", "A11", "A12", "A13", "A14", "A15", "A16"))%>%
  pivot_longer(-id, names_to="Rang", values_to = "keywords") %>%
  filter(! %>%
         keywords=str_replace(keywords, "text mining", "textmining"),
         keywords=str_replace(keywords, "text-mining", "textmining"),
         keywords=str_replace(keywords, "text analysis", "textanalysis"),
         keywords=str_replace(keywords, "text-analysis", "textanalysis"),
         keywords=str_replace(keywords, "user-generated content", "ugc"),
         keywords=str_replace(keywords, "user-generated content-ugc", "ugc"),
         keywords=str_replace(keywords, "user generated content", "ugc"),
         keywords=str_replace(keywords, "user genrated content (ugc)", "ugc"),
         keywords=str_replace(keywords, "artificial intelligence", "ai"),
         keywords=str_replace(keywords, "artificial intelligence", "ai"),
         keywords=str_replace(keywords, "artificial intelligence (ai)", "ai"),
         keywords=str_replace(keywords, "artificial neural network", "ai"),
         keywords=str_replace(keywords, "artificial neural networks", "neural network"),
         keywords=str_replace(keywords, "natural language processing", "nlp"),
         keywords=str_replace(keywords, "natural language processing (nlp)", "nlp"),
         keywords=str_replace(keywords, "natural language processing (nlp)-based approach", "nlp"),
         keywords=str_replace(keywords, "natural language processing (nlp)-based approach", "nlp"),
         keywords=str_replace(keywords, "topic model analysis", "topics"),
         keywords=str_replace(keywords, "topic modeling", "topics"),
         keywords=str_replace(keywords, "topic modelling", "topics"),
         keywords=str_replace(keywords, "topic model", "topics"),
         keywords=str_replace(keywords, "e wom", "ewom"),
         keywords=str_replace(keywords, "e-wom", "ewom"),
         keywords=str_replace(keywords, "electronic word-of-mouth", "ewom"),
         keywords=str_replace(keywords, "electronic word of mouth", "ewom"),
         keywords=str_replace(keywords, "electronic word of mouth (ewom)", "ewom"),
         keywords=str_replace(keywords, "online word-of-mouth", "ewom"),
         keywords=str_replace(keywords, "online word of mouth", "ewom"),
         keywords=str_replace(keywords, "online word of mouth", "ewom"),
         keywords=str_replace(keywords, "negative word-of-mouth  nwom", "ewom"),
         keywords=str_replace(keywords, "word-of-mouth communication", "ewom"),
         keywords=str_replace(keywords, "word-of-mouth", "ewom"),
         keywords=str_replace(keywords, "online review", "reviews"),
         keywords=str_replace(keywords, "online reviews", "reviews"),
         keywords=str_replace(keywords, "social media", "socialmedia"),
         keywords=str_replace(keywords, "latent dirichlet allocation", "lda"),
         keywords=str_replace(keywords, "latent dirichlet allocation (lda)", "lda"),
         keywords=str_replace(keywords, "latent dirichlet allocation algorithm", "lda"),
         keywords=str_replace(keywords, "word embedding", "embedding"),
         keywords=str_replace(keywords, "word2vec", "embedding"),
         keywords=str_replace(keywords, "latent dirichlet allocation model", "lda"),
         keywords=str_replace(keywords, "machine learning", "ml"),         
         keywords=str_replace(keywords, "machine learning and linguistic analysis", "ml"),
         keywords=str_replace(keywords, "big data", "bigdata"),
         keywords=str_replace(keywords, "liwc", "liwc"),
         keywords=str_replace(keywords, "linguistic inquiry and word count liwc", "liwc"),
         keywords=str_replace(keywords, "bibliometrics", "bibliometric"),
         keywords=str_replace(keywords, "\\(", ""),
         keywords=str_replace(keywords, "\\)", "")
# Count of each recoded keyword.
# NOTE(review): the pipe below has no right-hand side in this copy; the counting
# step that produces the `n` column of the printed table is missing.
foo<-key1%>% group_by(keywords)%>%
# A tibble: 1,151 × 2
   keywords                                 n
   <chr>                                <int>
 1 10-k reports                             1
 2 abstract vs. concrete mindsets           1
 3 accommodation                            1
 4 accommodative and defensive response     1
 5 acoustic                                 1
 6 actionable insights                      1
 7 adaptive clothing                        1
 8 advanced analytics                       1
 9 advertising                              4
10 advertising slogans                      1
# ℹ 1,141 more rows
# Break every recoded keyword into its individual words: split on spaces into up
# to 16 columns (A1..A16), stack them back into one `keywords` column, and keep
# only the non-empty, non-missing words.
key2 <- key1 %>%
  select(-Rang) %>%
  separate(keywords, into = paste0("A", 1:16), sep = " ") %>%
  pivot_longer(-id, names_to = "Rang", values_to = "keywords") %>%
  filter(nchar(keywords) > 0)


# Wide table with one column per word holding its count `m`.
# NOTE(review): summarize() is called with no visible group_by(); the grouping
# step seems to have been lost - confirm.
foo2<-key2 %>%
  summarize(m=n()) %>% 
  pivot_wider(id,names_from = "keywords", values_from ="m", names_repair= "unique" )

# NOTE(review): `replace(,0)` is missing its condition and the pipe has no
# right-hand side; `foo3`, `tsne_out2`, `keywords` and `key3` used below are not
# defined in the visible code.
foo2<- foo2 %>% 
  replace(,0)  %>%

tsne_out <- Rtsne(foo3,perplexity = 25, dim=2,  check_duplicates = FALSE) # Run TSNE
# Attach the word labels to the coordinates, then merge with key3.
tsne_out3<-cbind(tsne_out2, keywords) 
tsne_out3<- merge(tsne_out3,key3)

# NOTE(review): the start of this plotting pipeline (the data being piped into
# ggplot) is missing. Only words with k > 4 are labelled.
  ggplot(aes(x=V1, y=V2, label=keywords))+
  geom_text_repel(aes(label=ifelse(k>4,keywords,""),size=k), max.overlaps =50)+
  scale_size_continuous(range = c(1, 10))

# Save the last plot drawn (27 x 18 cm).
ggsave(filename="./images/keywords03.jpeg", plot=last_plot(), width = 27, height = 18, units = "cm")
# Publication year of each article, joined onto the keyword counts below.
foo <- df %>% select(id, year)

# Method-related keywords over time.
# The recoding step produces the token "embedding" (from "word embedding" and
# "word2vec"); the previous filter tested "embbeding", which can never match.
# NOTE(review): summarize() is called with no visible group_by(); a grouping
# step (presumably by id and keywords) seems to have been lost - confirm.
foo5 <- key2 %>%
  summarize(m = n()) %>%
  left_join(foo) %>%
  filter(
    keywords %in% c(
      "ai", "textmining", "nlp", "topics",
      "lda", "ml", "embedding", "liwc"
    )
  )

# NOTE(review): no geom layer is visible between ggplot() and the scales; the
# layer that draws the data appears to be missing in this copy.
ggplot(foo5, aes(x=year, y=m, group=keywords))+
  scale_fill_manual(values = rev(wes_palette("Zissou1", n = length(unique(foo5$keywords)), type = "continuous")))+ #nice palette
  scale_x_continuous(breaks = unique(foo5$year)) #show every year on x axis

# Object-related keywords over time (same construction as above).
foo5 <- key2 %>%
  summarize(m = n()) %>%
  left_join(foo) %>%
  filter(
    keywords %in% c("ewom", "reviews", "socialmedia", "ugc", "bibliometric")
  )

ggplot(foo5, aes(x=year, y=m, group=keywords))+
  scale_fill_manual(values = rev(wes_palette("Zissou1", n = length(unique(foo5$keywords)), type = "continuous")))+ 
  scale_x_continuous(breaks = unique(foo5$year)) #show every year on x axis

4 Structural topic model (STM)

4.1 Data preparation

# Text analysed by the topic model: title, then abstract text, then keywords.
df$Texte<-paste(df$title, " . ", df$text, df$keywords)

# NOTE(review): the beginning of this statement is missing in this copy. The
# visible part recodes years 2011-2015 to 2015 and 2016-2019 to 2019, leaves
# other years unchanged, and pipes the result into a step that was also lost.
                ifelse(df$year>2010 & df$year<2016,2015,
                       ifelse(df$year>2015 & df$year<2020,2019,df$year)))%>%

2010 2015 2019 2020 2021 2022 2023 
  15   32   66   43   60   78   75 
# Build a quanteda corpus with "Texte" as the text and "id" as document id.
# NOTE(review): the first argument (the data frame) and the closing parenthesis
# of this call are missing in this copy.
corpus <- quanteda::corpus(
            text_field = "Texte",
            docid_field = "id",
            unique_docnames = FALSE

# Document-feature matrix, trimmed by term and document frequency.
# NOTE(review): the calls that take the trimming arguments below (presumably a
# dfm_trim(...) followed by a feature-selection call for `min_nchar`) are
# missing in this copy - confirm against the original notebook.
corpus_dfm <-
      quanteda::dfm(corpus) %>%
      # tokens_remove = 'nan',
      # Get rid of any word that doesn't occur at least x
       min_termfreq = 5,
       # Get rid of any word that is in at least x per cent of documents
       max_docfreq = 0.9,
       # Below we specify percentages - we can specify counts or ranks
       docfreq_type = "prop"
     ) %>%
      # '[a-zA-Z]',         # filter out terms without alpha
      # valuetype='regex',
      min_nchar = 2)        # Only words with at least 2 characters

# Convert the matrix to the input format expected by stm.
out <- quanteda::convert(corpus_dfm, to = 'stm')

K = 16 # number of topics

# "1:K" as text, used as the left-hand side of the effect formula.
FORMULA_RANGE <-paste("1",as.character(K), sep=":")
# Topic prevalence is modelled as a function of year.
PREVALENCE_FORMULA <- as.formula(paste("~", "year"))
# Same covariate, applied to topics 1..K when estimating the effects.
PREVALENCE_FORMULA_EFFECT <- as.formula( paste(FORMULA_RANGE, paste("~ ", "year",  collapse="+") ) )

# NOTE(review): OUTPUT_FILE is not used in the visible code; the model is saved
# under fixed .rds names in the estimation step.
OUTPUT_FILE <- sprintf("STM_model_%s.RData", K)


4.2 Estimation

# Fit the structural topic model with K topics and year as prevalence covariate.
# NOTE(review): the closing parenthesis of this call (and possibly further
# arguments after `emtol`) is missing in this copy.
stm_model <- stm::stm(
                        documents = out$documents, 
                        vocab = out$vocab,
                        data = out$meta,
                        K = K, 
                        prevalence = PREVALENCE_FORMULA,
                        verbose = TRUE, # show progress
                        init.type = "Spectral",
                        seed = 56,
                        emtol = 1e-05,
# Regress the prevalence of topics 1..K on year.
stm_effects <- stm::estimateEffect(PREVALENCE_FORMULA_EFFECT,
      stmobj = stm_model, metadata = out$meta)

# Persist the fitted model and the estimated effects.
saveRDS(stm_model, "stm_model.rds")
saveRDS(stm_effects, "stm_effects.rds")


# Print, for every topic, the 5 top words under each labelling metric.
labelTopics(stm_model, topics = NULL, n = 5, frexweight = 0.5)
Topic 1 Top Words:
     Highest Prob: research, this, analysis, consumer, marketing 
     FREX: journal, articles, published, journals, future 
     Lift: emic, etic, gamification, industrial-buying, materialism 
     Score: overview, journal, bibliometric, journals, articles 
Topic 2 Top Words:
     Highest Prob: service, reviews, online, quality, this 
     FREX: service, quality, hotel, m-banking, restaurants 
     Lift: sensorial, servicescape, cse, friendly, m-banking 
     Score: sb, service, m-banking, hotel, o2o 
Topic 3 Top Words:
     Highest Prob: content, that, consumers, this, more 
     FREX: privacy, advertising, volatility, photos, content 
     Lift: cookies, narrativity, slogans, sandbox, volatility 
     Score: slogans, volatility, privacy, sandbox, photos 
Topic 4 Top Words:
     Highest Prob: consumers, this, study, that, consumer 
     FREX: customization, regulatory, fit, sustainable, cues 
     Lift: variety-seeking, customization, korean, scboei, vulnerability 
     Score: customization, variety-seeking, regulatory, vulnerability, fit 
Topic 5 Top Words:
     Highest Prob: we, that, for, text, consumer 
     FREX: models, we, dictionary, our, nostalgia 
     Lift: camel, fictitious, cgas, nostalgia, machines 
     Score: nostalgia, camel, milk, loan, memory 
Topic 6 Top Words:
     Highest Prob: reviews, product, online, on, review 
     FREX: product, products, reviews, review, amazon 
     Lift: numeric, votes, seekers, agricultural, ux 
     Score: agricultural, helpfulness, review, product, prices 
Topic 7 Top Words:
     Highest Prob: customer, reviews, for, that, online 
     FREX: customer, needs, lobbying, satisfaction, participation 
     Lift: adaptive, boxes, freelancers, lobbying, dissatisfied 
     Score: boxes, lobbying, needs, participation, customer 
Topic 8 Top Words:
     Highest Prob: study, that, from, on, is 
     FREX: green, tourism, tourists, destination, qualitative 
     Lift: ocr, recommenders, word-of-machine, gcb, green 
     Score: green, 酒店, psychophenomenology, 绿色, tourists 
Topic 9 Top Words:
     Highest Prob: that, on, this, satisfaction, is 
     FREX: airbnb, hosts, guest, investment, intangible 
     Lift: cmo, incentive, investments, positivity, synopsis 
     Score: ego-drive, hosts, airbnb, constraint, investment 
Topic 10 Top Words:
     Highest Prob: social, media, analysis, this, for 
     FREX: social, media, facebook, twitter, covid-19 
     Lift: anorexia, brand’s, e-wom, obamacare, sadness 
     Score: problematic, arabic, media, twitter, facebook 
Topic 11 Top Words:
     Highest Prob: that, on, is, language, for 
     FREX: macau, ibs, websites, al-shariah, maqasid 
     Lift: chatgpt, dis, ibs, ma, maqasid 
     Score: religious, macau, ibs, al-shariah, maqasid 
Topic 12 Top Words:
     Highest Prob: brand, brands, is, online, this 
     FREX: brand, brands, luxury, celebrity, associations 
     Lift: company-defined, hashtags, attachment, celebrity, co-branding 
     Score: attachment, brand, celebrity, luxury, fashion 
Topic 13 Top Words:
     Highest Prob: text, analysis, for, data, on 
     FREX: market, segmentation, technology, emphasis, semantic 
     Lift: b2c-sis, collectors, connected-home, enforcement, eo 
     Score: enforcement, training, b2b-sis, educational, emphasis 
Topic 14 Top Words:
     Highest Prob: marketing, data, text, for, this 
     FREX: analytics, marketing, orientation, artificial, marketers 
     Lift: 10-k, blocks, mcdonald's, pig, rapport 
     Score: excellence, orientation, telemarketing, c-suite, entrepreneurial 
Topic 15 Top Words:
     Highest Prob: on, engagement, that, media, social 
     FREX: engagement, live, idea, affective, personal 
     Lift: diaspora, gaze, learners, stereotypes, voluntary 
     Score: stereotypes, engagement, live, short-form, learners 
Topic 16 Top Words:
     Highest Prob: ewom, web, for, text, online 
     FREX: ewom, web, sites, electronic, internet 
     Lift: seafood, emerald, ewom, sites, triggers 
     Score: seafood, ewom, sites, complaint, web 

4.3 Topics’ description (beta)

Four indicators are reported for each topic: highest probability, FREX (which combines frequency and exclusivity), lift and score.

\(FREX = \left(\frac{w}{F} + \frac{1-w}{E}\right)^{-1}\)

The lift:

\(Lift = \beta_{k,v}/(w_v/\sum_v w_v)\)

the score:

\(Score = \beta_{v,k} \left(\log \beta_{v,k} - \frac{1}{K} \sum_{k'} \log \beta_{v,k'}\right)\)

# Word-by-topic table of log probabilities (transposed so that rows are words).
# NOTE(review): several pipes in this chunk end on a dangling `%>%`, and one
# line contains `%>% %>%`; the missing steps must be restored from the original
# notebook. `logbeta` passed to calcfrex() is not defined in the visible code.
beta<-t(stm_model$beta$logbeta[[1]]) %>%


vocab<-stm_model$vocab %>%

# Name the vocabulary column "feature".
vocab<- vocab %>% dplyr::rename(feature= 1)


# Long table: one row per (feature, topic) with its log probability and
# probability p = exp(logprob).
beta2<-cbind(vocab, beta) %>%
  pivot_longer(-feature, names_to="topic", values_to = "logprob") %>%
         p=exp(logprob)) %>%

# topic labelling

# FREX scores with weight 0.25.
beta3<-calcfrex(logbeta, w = 0.25, wordcounts = NULL)  

beta4<-cbind(vocab, beta3) %>% %>%
  pivot_longer(-feature, names_to="topic2", values_to = "frex") %>%

# NOTE(review): the join uses a `topic` column, but beta4 is pivoted into a
# column named `topic2`; a rename was presumably lost - confirm.
beta2<- beta2 %>% 
  left_join(beta4, by=c("feature", "topic"))

synth<- labelTopics(stm_model, topics = NULL, n = 5, frexweight = 0.25)

# labels suggested by Bard

beta2$topic[beta2$topic=="V1"]<-"T01 Analyse bibliométrique\ndes recherches marketing"
beta2$topic[beta2$topic=="V2"]<-"T02 Qualité du service en ligne"
beta2$topic[beta2$topic=="V3"]<-"T03 Publicité en ligne et contenu personnalisé"
beta2$topic[beta2$topic=="V4"]<-"T04 Personnalisation du produit et régulation"
beta2$topic[beta2$topic=="V5"]<-"T05 Consumer exp dictionnary " # it struggles with this one
beta2$topic[beta2$topic=="V6"]<-"T06 Avis en ligne et sentiment "
beta2$topic[beta2$topic=="V7"]<-"T07 Customer online review and satisfaction"
beta2$topic[beta2$topic=="V8"]<-"T08 Tourism"
beta2$topic[beta2$topic=="V9"]<-"T09 Decoding Guest Satisfaction Through Airbnb Reviews"
beta2$topic[beta2$topic=="V10"]<-"T10 Social Media Analysis"
beta2$topic[beta2$topic=="V11"]<-"T11 Specific Macau"
beta2$topic[beta2$topic=="V12"]<-"T12 Online Brand Strategy"
beta2$topic[beta2$topic=="V13"]<-"T13 Text Data for Marketing Decisions"
beta2$topic[beta2$topic=="V14"]<-"T14 Social Media Engagement"
beta2$topic[beta2$topic=="V15"]<-"T15 Text Analytics in Market Segmentation"
beta2$topic[beta2$topic=="V16"]<-"T16 Ewom"

# transparency reflects FREX exclusivity
# One word cloud per topic: word size follows p, alpha follows 2100 - frex.
stm_topics <- ggplot(beta2)+
  geom_text_wordcloud(aes(label=feature, size=p, alpha=2100-frex), color="darkblue")+
  facet_wrap(vars(topic), ncol=4)+
  scale_size_area(max_size = 7)+
  theme(strip.text = element_text(size = 10))

# NOTE(review): the arguments below look like the tail of a ggsave(...) call
# whose first line is missing - confirm.
       width = 28, 
       height = 20, 
       units = "cm",
       dpi = 300)

4.4 Prevalence description (beta)



# to split the lists and turn them into data frames
# careful, this is disruptive
# NOTE(review): `foo` is not the object defined earlier for the year join; the
# line that extracts the list of effect estimates was presumably lost, and the
# pipe below has no right-hand side.
foo1<- ldply(foo, data.frame)%>%


# Names of the regression parameters: the intercept ("cte") and one per period.
# NOTE(review): naming this object `F` masks R's `F` shorthand for FALSE.
F<-data.frame(param=c("cte", "2015", "2019","2020","2021","2022","2023")) 

# repeated as many times as K
param<-rbind(F, F, F, F, F, F, F, F, F, F,
             F, F, F, F, F ,F)

# NOTE(review): the 16 labels below are the body of an assignment whose opening
# and closing lines are missing (presumably the data frame `R` with a `topic`
# column, as used just after) - confirm.
"T01 bibliometrics consumer",
"T02 Service quality review",
"T03 Ads and privacy",
"T04 Customization",
"T05 Consumer exp dictionnary ",
"T06 Product review sentiment ",
"T07 customer online review",
"T08 Tourism",
"T09 Airbnb",
"T10 Social media",
"T11 specific",
"T12 Branding",
"T13 marketing analytics",
"T14 engagement media sociaux",
"T15 marketing strategy",
"T16 e wom"

# repeated as many times as there are parameters
R<-rbind(R, R, R, R, R, 
         R, R ) %>% arrange(topic)

# Mean, standard error and |t| of the simulated estimates; the intercept row is
# relabelled "2010".
# NOTE(review): summarise() is called with no visible group_by() (presumably by
# topic and param), and the divisor sqrt(24) is hard-coded - confirm both.
foo2<-cbind(foo1, R, param)%>%
  pivot_longer(-c("topic","param"), names_to = "sim", values_to = "est") %>%
  dplyr::summarise(mean=mean(est, na.rm=TRUE),
            se =sd(est)/sqrt(24),
            t=abs(mean)/se) %>%
  dplyr::mutate(param= ifelse(param=="cte", "2010", param))

# Same summary, reduced to one column renamed `cte`.
# NOTE(review): a step restricting the rows to the intercept seems to be
# missing before the columns are dropped by position - confirm.
foo3<-cbind(foo1, R, param)%>%
  pivot_longer(-c("topic","param"), names_to = "sim", values_to = "est") %>%
  dplyr::summarise(mean=mean(est, na.rm=TRUE),
            se =sd(est)/sqrt(24),
            t=abs(mean)/se) %>%
  dplyr::select(-2,-4,-5) %>%
  dplyr::rename(cte = mean)

# NOTE(review): the pipe below has no right-hand side; the step creating the
# `mean2` column used by the plots is missing.
foo4<-foo2 %>%left_join(foo3) %>% 

# dplyr::mutate(time=ifelse(str_sub(param, 1,1)=="0" |str_sub(param, 1,1)=="1" , "no", "yes"))

# One panel per topic showing mean2 across periods.
# NOTE(review): no geom layer is visible (the only one is commented out).
stm_effect <- ggplot(foo4, aes(x=param,y=mean2,group=topic))+
#  geom_smooth(se = FALSE )+
  facet_wrap(vars(topic), scale="fixed",ncol =4)+
  theme(axis.text.y = element_text(size=5))+
  theme(axis.text.x = element_text(size=5))+
  labs(x=NULL, y="prévalence")


# NOTE(review): the arguments below look like the tail of a ggsave(...) call
# whose first line is missing - confirm.
       width = 28, 
       height = 20,
       units = "cm",
       dpi = 300)

4.5 The law of entropy

# Palette of 16 colours interpolated from the 8 colours of "Set2".
nb.cols <- 16
mycolors <- colorRampPalette(brewer.pal(8, "Set2"))(nb.cols)

# Stacked bars of mean2 per period, one colour per topic.
topics_plot <- ggplot(foo4, aes(x=param,y=mean2,group=topic))+
  geom_bar(stat="identity", aes(fill=topic))+
#  geom_smooth(se = FALSE )+
  theme(axis.text.y = element_text(size=5))+
  theme(axis.text.x = element_text(size=5))+
  labs(x=NULL, y="prévalence")+
  scale_fill_manual(values = mycolors)


# NOTE(review): the start of this pipeline (the assignment to `foo5` and its
# input) and its end (the step producing the `entropie` column plotted below)
# are missing in this copy. The visible part computes mean2 * log(mean2) per
# row and groups by period.
  mutate(e=mean2*log(mean2)) %>%
  group_by(param) %>%

# Entropy per period, with a smoothed trend; the y axis is limited to [1.5, 3].
ggplot(foo5,aes(x=param, y=entropie, group = 1))+
  geom_point(stat = "identity")+geom_smooth(se=FALSE)+ylim(1.5,3)

5 Networks of authors

5.1 Check name of authors

We need to check whether any authid is associated with more than one author name. If so, we replace the different spellings with a single name, so that each author is represented by exactly one node in the network later on.

# For every author id, keep id, name and article id, and add the number of rows
# `n` carrying that author id.
# NOTE(review): the pipe below has no right-hand side in this copy; the
# verification chunk later in the document runs the same chain without a
# trailing pipe.
test <- list_articles %>%
  group_by(authid) %>%
  select(authid, authname, id) %>%
  dplyr::mutate(n = n()) %>%

# Author ids associated with more than one distinct name; one row (the first
# encountered) is kept per author id.
result <- test %>%
  group_by(authid) %>%
  filter(n_distinct(authname) > 1) %>%
  distinct(authid, .keep_all = TRUE)

# Interactive table of the affected authors, then their count.
result %>% reactable()
number_duplicates <- nrow(result)

cat("There are ", number_duplicates, " authors registered with different names.")
There are  21  authors registered with different names.

5.2 Correct the duplicate names

Let’s correct that by using one property of the distinct function: with the .keep_all = TRUE parameter, distinct(authid, .keep_all = TRUE) keeps the first row encountered for each authid, and therefore a single authname per author. Joining this table back onto the data is faster than manually changing the name of each author.

# Merge list_articles with result on the authid column.
# result holds one row per authid, so the join adds columns without adding
# rows: the name kept for the author arrives as authname.y (NA when the author
# had a single spelling).
merged_df <- left_join(list_articles, result, by = "authid")

# The replacement below pairs rows by position, so both tables must line up.
stopifnot(nrow(merged_df) == nrow(list_articles))

# Replace authname values in list_articles with those from result, keeping the
# original name wherever result has no entry for the author.
# NOTE(review): the condition was garbled in the source (`!$authname.y)`);
# restored as a missing-value test on merged_df$authname.y - confirm.
list_articles$authname <- ifelse(
  !is.na(merged_df$authname.y),
  merged_df$authname.y,
  list_articles$authname
)

cat("There are", n_distinct(list_articles$id), "articles and", n_distinct(list_articles$authname), "authors overall in the data.")
There are 369 articles and 901 authors overall in the data.
# Write the updated dataframe to a CSV file 
# (write_csv2 produces a semicolon-delimited file).
write_csv2(list_articles, "nlp_full_data_final_unique_author_names.csv")

It is now done. We can check again whether any authid is still associated with more than one author name.

5.3 Verification of duplicate names

# Re-run the check on the corrected data: id, name and article id per author id,
# plus the number of rows `n` carrying that author id.
test <- list_articles %>%
  group_by(authid) %>%
  select(authid, authname, id) %>%
  dplyr::mutate(n = n())

# Author ids still associated with more than one distinct name (expected: none).
# NOTE(review): the pipe below has no right-hand side in this copy; either a
# final step was lost or the trailing `%>%` should be removed.
result <- test %>%
  group_by(authid) %>%
  filter(n_distinct(authname) > 1) %>%
  distinct(authid, .keep_all = TRUE) %>%

result %>% reactable()

5.4 Construct the networks

# Constants to include in the network as hovering text for nodes
# (the assignment and the list brackets were missing in the source; the name is
# the one read by sigma_graph).
COLUMNS_TO_COLLECT = [
    'affilname', 'affiliation_country', 'title', 'review', 'year', 'citedby_count', 'subjects_area', 'keywords'
]

def get_author_info(filtered_articles, columns):
    """
    Given a DataFrame of filtered articles and a list of column names,
    this function collects author information and returns it as a dictionary.

    Parameters
    ----------
    filtered_articles : pandas.DataFrame
        Must contain the columns 'authname' and 'citedby_count' plus every
        name listed in `columns`. Each row is read as one author occurrence.
    columns : list of str
        Columns whose values are collected per author.

    Returns
    -------
    dict
        One entry per name in `columns`, mapping each author name to the
        distinct non-missing values met for that author, in order of first
        appearance, joined with " | ". An extra "citations" entry maps each
        author name to the sum of the non-missing 'citedby_count' values on
        that author's rows.
    """
    author_info = {col: {} for col in columns}
    author_info["citations"] = {}
    # Exact values already recorded per column and author. Testing membership
    # here rather than with `in` on the joined string avoids dropping a value
    # that is merely a substring of an earlier one ("Niger" after "Nigeria").
    seen_values = {col: {} for col in columns}

    for _, row in filtered_articles.iterrows():
        author_name = row['authname']

        if pd.notna(row['citedby_count']):
            author_info["citations"][author_name] = author_info["citations"].get(author_name, 0) + row['citedby_count']

        for col in columns:
            value = row[col]
            if pd.isna(value):
                continue
            value = str(value).strip()

            already_seen = seen_values[col].setdefault(author_name, set())
            if value in already_seen:
                continue
            already_seen.add(value)

            if author_name in author_info[col]:
                author_info[col][author_name] += " | " + value
            else:
                # First value for this author (this branch was missing in the
                # source, which made the assignment overwrite unconditionally).
                author_info[col][author_name] = value

    return author_info

def sigma_graph(dataframe, start_year, end_year):
    """
    Creates a graph representing author collaborations based on a given DataFrame of articles.
    Filters the articles based on the given start and end years.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'id', 'authid', 'authname' and every column listed in
        COLUMNS_TO_COLLECT. Rows sharing the same 'id' are treated as the
        co-authors of one article.
    start_year, end_year : int
        Inclusive bounds applied to the 'year' column.

    Returns
    -------
    (networkx.Graph, pandas.DataFrame) or (None, None)
        The collaboration graph and a table with one row per node holding its
        attributes; (None, None) when the DataFrame is empty or lacks one of
        the COLUMNS_TO_COLLECT columns.

    Side effects
    ------------
    Writes an interactive HTML rendering of the graph to
    networks/authors/{start_year}_{end_year}_VF.html.
    """
    # Error handling
    if dataframe.empty:
        print("The DataFrame is empty.")
        return None, None

    for column in COLUMNS_TO_COLLECT:
        if column not in dataframe.columns:
            print(f"The DataFrame is missing the column: {column}")
            return None, None

    list_articles = dataframe
    filtered_articles = list_articles[(list_articles['year'] >= start_year) & (list_articles['year'] <= end_year)]

    # One tuple per pair of co-authors of an article; an article with a single
    # author yields a pair of that author with itself (a self-loop).
    author_pairs = []
    grouped = filtered_articles.groupby('id')[['authid', 'authname']].agg(list).reset_index()

    for _, row in grouped.iterrows():
        article_id = row['id']
        authors = row['authid']
        authnames = row['authname']

        if len(authors) == 1:
            author_pairs.append((article_id, authors[0], authors[0], authnames[0], authnames[0]))
        elif len(authors) > 1:
            for i, j in combinations(range(len(authors)), 2):
                author_pairs.append((article_id, authors[i], authors[j], authnames[i], authnames[j]))

    result_df = pd.DataFrame(author_pairs, columns=['id', 'authid1', 'authid2', 'authname1', 'authname2'])

    # Sort the two names within each pair so that (a, b) and (b, a) are counted
    # as the same collaboration, then count the articles per pair.
    collaboration_df = result_df[["authname1", "authname2"]]
    collaboration_df = pd.DataFrame(np.sort(collaboration_df.values, axis=1), columns=collaboration_df.columns)
    collaboration_df['value'] = 1
    collaboration_df = collaboration_df.groupby(["authname1", "authname2"], sort=False, as_index=False).sum()

    G = nx.from_pandas_edgelist(collaboration_df, 'authname1', 'authname2', edge_attr='value', create_using=nx.Graph())

    for u, v in G.edges:
        G[u][v]["color"] = "#7D7C7C"

    # Store the collaboration count under 'weight' as well as 'value'.
    for _, row in collaboration_df.iterrows():
        G.add_edge(row['authname1'], row['authname2'], weight=row['value'])

    metrics = {
        'centrality': nx.degree_centrality,
        'betweenness': nx.betweenness_centrality,
        'closeness': nx.closeness_centrality,
        'eigenvector_centrality': partial(nx.eigenvector_centrality, max_iter=1000),
        'burt_constraint_weighted': partial(nx.constraint, weight="value"),
        'burt_constraint_unweighted': nx.constraint
    }

    for attr, func in metrics.items():
        nx.set_node_attributes(G, func(G), attr)

    author_info = get_author_info(filtered_articles, COLUMNS_TO_COLLECT)

    for col in COLUMNS_TO_COLLECT:
        nx.set_node_attributes(G, author_info[col], col)

    nx.set_node_attributes(G, author_info['citations'], 'citations')

    # Compute the inverse burt constraint to plot the lowest values as the biggest nodes
    # (= authors that are the less constrained in their network => multiple probable collaborations)
    for node in G.nodes:
        # Check if the 'burt_constraint_weighted' metric exists for the node
        if 'burt_constraint_weighted' in G.nodes[node]:
            burt_score = G.nodes[node]['burt_constraint_weighted']
            # Calculate the inverse, avoiding division by zero
            G.nodes[node]['inverse_burt_weighted'] = 1 / burt_score if burt_score != 0 else 0
        else:
            # Handle the case where the 'burt_constraint_weighted' metric is not available for this node
            # For example, by setting the value to None or a default value
            # (this `else:` was missing in the source, which reset every node to None)
            G.nodes[node]['inverse_burt_weighted'] = None  # or another default value

    # Compute Louvain communities
    partition = community_louvain.best_partition(G)
    for node, comm_number in partition.items():
        G.nodes[node]['community'] = comm_number

    # Color the graph using the greedy coloring algorithm with the 'largest_first' strategy
    colors = nx.greedy_color(G, strategy='largest_first', interchange=False)
    # Set the computed colors as an attribute to each node in the graph
    nx.set_node_attributes(G, colors, 'color')

    # Compute HITS scores
    hubs, authorities = nx.hits(G, max_iter=100, nstart=None, normalized=False)
    # Set the computed scores as an attribute to each node in the graph
    nx.set_node_attributes(G, hubs, 'hub_score')
    nx.set_node_attributes(G, authorities, 'authority_score')

    # Creating a list of dictionaries containing the attributes of each node
    data_for_df = []
    for node in G.nodes(data=True):
        # `node` is a tuple (node_name, attributes_dict)
        node_data = node[1]  # Extracting the attributes dictionary
        node_data['Node'] = node[0]  # Adding the node name as an attribute
        # Adding the attributes dictionary of this node to the list
        # (this append was missing in the source, leaving df_nodes empty)
        data_for_df.append(node_data)
    # Creating a DataFrame from the list of dictionaries
    df_nodes = pd.DataFrame(data_for_df)

    # NOTE(review): the line opening this call was missing in the source; the
    # `path` and `fullscreen` arguments indicate an HTML export through
    # ipysigma's Sigma.write_html - confirm against the original notebook.
    Sigma.write_html(
        G,
        default_edge_type       = "curve",                                            # Default edge type
        clickable_edges         = True,                                               # Clickable edges
        edge_size               = "value",                                            # Set edge size
        fullscreen              = True,                                               # Display in fullscreen
        label_density           = 3,                                                  # Label density (= increase to have more labels appear at normal zoom level)
        label_font              = "Helvetica Neue",                                   # Label font
        max_categorical_colors  = 10,                                                 # Max categorical colors
        node_border_color_from  = 'node',                                             # Node border color from node attribute
        node_color              = "community",                                        # Set node colors
        node_label_size         = "citations",                                        # Node label size
        #node_label_size_range   = (12, 36),                                          # Node label size range
        node_label_size_range   = (12, 30),                                           # Node label size range
        #node_metrics            = {"community": {"name": "louvain", "resolution": 1}}, # Specify node metrics
        node_size               = "citations",                                        # Node size
        #node_size_range         = (3, 30),                                           # Node size range
        node_size_range         = (2, 20),                                            # Node size range
        path                    = f"networks/authors/{start_year}_{end_year}_VF.html",  # Output file path
        start_layout            = 3,                                                  # Start layout algorithm
        #node_border_color      = "black",                                            # Node border color
        #edge_color             = "#7D7C7C"                                           # Edge color
        # node_label_color      = "community"                                         # Node label color
    )

    return G, df_nodes

5.5 Create the overall network (from 1996 to 2023)

# Report the earliest and latest publication year found in list_articles.
print("The year range of the publications is from {} to {}".format(int(list_articles['year'].min()), int(list_articles['year'].max())))
The year range of the publications is from 1996 to 2023

# Build one network per end year, from 2015 through 2023, always starting in
# 1996. Each sigma_graph call returns a pair (graph, node table), unpacked
# here into G_<start>_<end> and df_<start>_<end>.
# NOTE(review): sigma_graph also appears to export each network to
# networks/authors/<start>_<end>_VF.html - confirm against its definition.

G_1996_2015, df_1996_2015 = sigma_graph(list_articles, 1996, 2015)
G_1996_2016, df_1996_2016 = sigma_graph(list_articles, 1996, 2016)
G_1996_2017, df_1996_2017 = sigma_graph(list_articles, 1996, 2017)
G_1996_2018, df_1996_2018 = sigma_graph(list_articles, 1996, 2018)
G_1996_2019, df_1996_2019 = sigma_graph(list_articles, 1996, 2019)
G_1996_2020, df_1996_2020 = sigma_graph(list_articles, 1996, 2020)
G_1996_2021, df_1996_2021 = sigma_graph(list_articles, 1996, 2021)
G_1996_2022, df_1996_2022 = sigma_graph(list_articles, 1996, 2022)
G_1996_2023, df_1996_2023 = sigma_graph(list_articles, 1996, 2023)

# Graphs built above, keyed by their "start-end" year range and listed in
# chronological order of end year.
networks = {
    '1996-2015': G_1996_2015,
    '1996-2016': G_1996_2016,
    '1996-2017': G_1996_2017,
    '1996-2018': G_1996_2018,
    '1996-2019': G_1996_2019,
    '1996-2020': G_1996_2020,
    '1996-2021': G_1996_2021,
    '1996-2022': G_1996_2022,
    '1996-2023': G_1996_2023,
}

6 Analysis of the networks

Let’s sort by the weighted Burt’s constraint and look at the top 10 authors with the lowest values. You can also sort or filter by other metrics: Burt’s constraint, number of citations, betweenness centrality, closeness centrality, hub score, authority score (HITS)…

# Pull the 1996-2023 node table computed in Python and add a citation rank
# (1 = most cited).
# Ties share the best rank of their group: the default "average" method
# yields fractional ranks that as.integer() would truncate (three authors
# tied for places 1-3 would all be shown as rank 2, and nobody as rank 1).
df_1996_2023 <- py$df_1996_2023 %>%
  dplyr::mutate(
    rank_citations = as.integer(rank(desc(citations), ties.method = "min"))
  )

reactable_burt <- df_1996_2023 %>%
  dplyr::select(Node, rank_citations, citations, burt_constraint_weighted, betweenness, closeness, hub_score, authority_score, affilname, affiliation_country, community) %>%
# center headers in reactable
          searchable = TRUE,
          defaultPageSize = 5,
          highlight = TRUE,
          bordered = TRUE,
          striped = TRUE,
          compact = TRUE,
          defaultColDef = colDef(
              header = function(value) gsub("_", " ", (capitalize(value)), fixed = TRUE),
              cell = function(value) format(value, nsmall = 1),
              align = "center",
              minWidth = 70,
              headerStyle = list(background = "#f7f7f8")
# Select only the numeric columns from the dataframe.
# vapply() always returns a logical vector; sapply() returns an empty list for
# a zero-column data frame, which cannot be used as a column index.
df_numeric <- df_1996_2023[vapply(df_1996_2023, is.numeric, logical(1))]

# Calculate the correlation matrix on complete rows only and round the values
# to two decimal places
corr <- round(cor(df_numeric, use = "complete.obs"), 2)

# Calculate a matrix of p-values for the correlations
p.mat <- cor_pmat(df_numeric)

# Visualize the lower triangle of the correlation matrix using ggcorrplot
# The heatmap color indicates the correlation coefficient between variables
# 'hc.order' is set to TRUE to reorder the matrix using hierarchical clustering
# 'type' is set to "lower" to show only the lower triangle of the matrix
# 'outline.col' is set to "white" to define the border color around the squares
# 'p.mat' passes the matrix of p-values computed for the same variables
corr.plot <- ggcorrplot(
  corr,
  hc.order = TRUE,
  type = "lower",
  outline.col = "white",
  p.mat = p.mat
)

# Convert the ggcorrplot object to an interactive plotly object
# This allows for interactive features such as tooltips on hover
# Compute correlation coefficients
cor.coef <- cor(df_numeric)

# Compute correlation p-values
cor.test.p <- function(x){
    FUN <- function(x, y) cor.test(x, y)[["p.value"]]
    z <- outer(
      Vectorize(function(i,j) FUN(x[,i], x[,j]))
    dimnames(z) <- list(colnames(x), colnames(x))
p <- cor.test.p(df_numeric)

# Create the heatmap
  node_type = "scatter",
  point_size_mat = -log10(p), 
  point_size_name = "-log10(p-value)",
  label_names = c("x", "y", "Correlation")

6.1 An interesting metric: the graph density

The graph density is the ratio of the number of edges to the maximum number of possible edges. It is a measure of the proportion of edges present in a graph. A graph with a high density has a large number of edges compared to the number of nodes. A graph with a low density has a small number of edges compared to the number of nodes.

A more formal definition is given here by the following formulas:

  • For undirected graphs:

    \[ \begin{equation}d=\frac{2 m}{n(n-1)}\end{equation} \]

  • For directed graphs:

\[ \begin{equation}d=\frac{m}{n(n-1)}\end{equation} \]

where \(n\) is the number of nodes and \(m\) is the number of edges in the graph.

From an interpretation standpoint, we can read the density of the graphs below as follows:

\(d\) Interpretation
Close to \(0\)
  • The collaborative relationships among authors are sparse:

  • Authors have limited connections with each other outside of their community.

  • Scientific papers are primarily the work of individual authors or small isolated groups.

Close to \(1\)
  • Authors frequently collaborate with one another, leading to a web of interconnected scientific collaborations.

  • Scientific papers often involve contributions from multiple authors, reflecting a high level of teamwork and interdisciplinary research.

  • Collaborations are a significant aspect of the research process in this marketing field, and authors actively seek out opportunities to work together.

  • The network of collaborations is well-established and robust, facilitating the exchange of ideas and the advancement of scientific knowledge.

Let’s compare the density of multiple networks over time:

# Compute the density and a few size metrics for each network created above
# with networkx. One single-row DataFrame is built per network and all of them
# are concatenated at the end.

density_columns = [
    "network",
    "density",
    "nb_edges",
    "nb_nodes",
    "linear_density",
    "nb_communities",
    "nb_mean_nodes_per_community",
]

temp_dfs = []

for network_name, network in networks.items():
    density = nx.density(network)
    nb_nodes = network.number_of_nodes()
    nb_edges = network.number_of_edges()
    # Nodes per edge; 0 when the network has no edge
    linear_density = nb_nodes / nb_edges if nb_edges != 0 else 0
    # Number of distinct values of the "community" node attribute
    nb_communities = len(set(nx.get_node_attributes(network, "community").values()))
    # Same zero guard as linear_density: no community attribute means no ratio
    nb_mean_nodes_per_community = nb_nodes / nb_communities if nb_communities != 0 else 0
    temp_df = pd.DataFrame([{
        "network": network_name,
        "density": density,
        "nb_edges": nb_edges,
        "nb_nodes": nb_nodes,
        "linear_density": linear_density,
        "nb_communities": nb_communities,
        "nb_mean_nodes_per_community": nb_mean_nodes_per_community,
    }])
    temp_dfs.append(temp_df)

# Concat all the temp_dfs into one dataframe; pd.concat raises on an empty
# list, so fall back to an empty frame with the expected columns
if temp_dfs:
    density_df = pd.concat(temp_dfs, ignore_index=True)
else:
    density_df = pd.DataFrame(columns=density_columns)
# Bring the metrics table computed in Python into R
density_df <- py$density_df

# Reshape to long format: one row per (network, metric) pair.
# linear_density and nb_nodes are dropped, and values are rounded to 3 decimals.
density_long <- density_df %>%
  tidyr::pivot_longer(
    cols = -network, # Every column except the network label
    names_to = "metric", # Name of new column for the metrics
    values_to = "value" # Name of new column for the values
  ) %>%
  filter(!metric %in% c("linear_density","nb_nodes")) %>%
  mutate(value = round(value, 3))

# Human-readable facet titles, keyed by the metric column names
metric_titles <- c(
  density = "Density",
  nb_edges = "Number of Collaborations",
  nb_communities = "Number of Communities",
  nb_mean_nodes_per_community = "Mean Number of Authors per Community"
)

# Build the chart with ggplot: one panel per metric, each with its own y scale,
# showing the metric's value across the networks as a line with points
density_plot <- ggplot(density_long, aes(x = network, y = value)) +
  geom_line(aes(group = metric), color = "steelblue") +
  geom_point(color = "steelblue") +
  facet_wrap(
    ~ metric,
    scales = "free_y",
    labeller = labeller(metric = metric_titles)
  ) +
  labs(title = "", x = "Networks", y = "Value") +
  theme(plot.title = element_text(hjust = 0.5))

# Write the chart to disk
ggsave(
  "images/density_networks.png",
  plot = density_plot,
  width = 14,
  height = 8,
  dpi = 300
)


6.2 Communities

6.2.1 Pareto chart of citations per community

# Let's look at the communities of the network 1996-2023

top5_communities <- df_1996_2023 %>%
  group_by(community) %>%
  dplyr::summarise(n = n()) %>%
  arrange(desc(n)) %>%

# Keep only the authors whose community is listed in top5_communities
filtered_df <- dplyr::filter(
  df_1996_2023,
  community %in% top5_communities$community
)

# Total citations per community, ordered from most to least cited, with the
# running total and the running share (in %) of all citations
citations_per_community <- df_1996_2023 %>%
  dplyr::group_by(community) %>%
  dplyr::summarise(citations = sum(citations)) %>%
  dplyr::arrange(dplyr::desc(citations)) %>%
  dplyr::mutate(
    cumulative_citations = cumsum(citations),
    percentage = cumulative_citations / sum(citations) * 100
  )

# Interactive table with copy/export buttons
datatable(
  citations_per_community,
  rownames = FALSE,
  extensions = 'Buttons',
  options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'csv', 'excel', 'pdf', 'print')
  )
)
# Pareto chart of citations per community, without x-axis ticks and labels:
# bars show each community's citations (most cited first), dark red points show
# the cumulative citations, and dashed lines mark 25%, 50% and 75% of the
# cumulative total.
ggplot(citations_per_community, aes(x = reorder(community, -citations), y = citations)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_point(aes(y = cumulative_citations), group = 1, color = "darkred", size = 1) +
    name = "Cumulative number of citations", 
    sec.axis = sec_axis(~./max(citations_per_community$cumulative_citations) * 100, name = "Cumulative percentage")
  ) +
  geom_hline(yintercept = max(citations_per_community$cumulative_citations) * 0.25, linetype = "dashed", color = "black") +
  geom_hline(yintercept = max(citations_per_community$cumulative_citations) * 0.50, linetype = "dashed", color = "black") +
  geom_hline(yintercept = max(citations_per_community$cumulative_citations) * 0.75, linetype = "dashed", color = "black") +
  theme_bw() +
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black"),
    axis.ticks.x = element_blank(),  # Remove the tick marks on the x-axis
    axis.text.x = element_blank()  # Hide the x-axis labels
  ) +
    title = "Pareto chart of citations per community",
    x = "Community",
    y = "Number of Citations"

# No plot is passed, so ggsave() writes the last plot displayed
ggsave("images/pareto_chart_ggplot_no_xtickslabels.png", width = 20, height = 10, dpi = 300)

6.2.2 Proportion of the top 5 communities in terms of citations and authors

# Sum of all citations from filtered_df for each community of the top 5
top5_communities_citations <- filtered_df %>%
  group_by(community) %>%
  dplyr::summarise(citations = sum(citations)) %>%

sum_citations_df_1996_2023 <- sum(df_1996_2023$citations)

# Compute the proportion of citations and number of authors for each community
top5_communities_citations <- top5_communities_citations %>% 
  left_join(top5_communities, by = "community") %>%
  dplyr::mutate(prop_citations = citations / sum_citations_df_1996_2023) %>%
  dplyr::mutate(prop_authors = n / nrow(df_1996_2023)) %>%
  dplyr::mutate(cumul_prop_citations = round(cumsum(prop_citations) * 100,2)) %>%
  dplyr::mutate(cumul_prop_authors = round(cumsum(prop_authors) * 100,2)) %>% 
  dplyr::mutate(prop_citations = round(prop_citations * 100,2)) %>%
  dplyr::mutate(prop_authors = round(prop_authors * 100,2)) %>%

# Presentation table for the communities in top5_communities_citations:
# a title, readable column headers and centred columns
gt_top5_communities_citations <- gt(top5_communities_citations) %>%
  tab_header(
    title = "Proportion of citations and authors for each community"
  ) %>%
  cols_label(
    community = "Community",
    citations = "Number of citations",
    n = "Number of authors",
    prop_citations = "Proportion of citations (in %)",
    prop_authors = "Proportion of authors (in %)",
    cumul_prop_citations = "Cumulative proportion of citations (in %)",
    cumul_prop_authors = "Cumulative proportion of authors (in %)"
  ) %>%
  cols_align(columns = everything(), align = "center")

# save gt in png
Proportion of citations and authors for each community
Community Number of citations Number of authors Proportion of citations (in %) Proportion of authors (in %) Cumulative proportion of citations (in %) Cumulative proportion of authors (in %)
21 6050 28 18.76 3.03 18.76 3.03
9 4533 32 14.06 3.47 32.81 6.50
45 2524 23 7.83 2.49 40.64 8.99
29 1321 23 4.10 2.49 44.74 11.48
57 403 20 1.25 2.17 45.99 13.65
