NLP Workshop
November 9, 2023
There are 405 articles and 995 unique authors.
Publication dates range from 1996 to 2023.
There are 21,176 unique references (27,710 overall) and 11,214 unique first authors.
The publication dates of the references range from 1879 to 2023.
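A minimal sketch of how the article-level figures can be computed, assuming a long-format pandas DataFrame `list_articles` (one row per article-author link, the same table used in the code below):

import pandas as pd

# Count distinct articles and authors, and get the publication-year range.
n_articles = list_articles['entry_number'].nunique()
n_authors = list_articles['authid'].nunique()
year_min, year_max = int(list_articles['year'].min()), int(list_articles['year'].max())
print(f"{n_articles} articles, {n_authors} unique authors, published between {year_min} and {year_max}")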
The author names are now unique. The `.keep_all = TRUE` argument keeps the first occurrence of each group, i.e. the first row encountered for each unique combination of `authid` and `authname`.
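For reference, a pandas equivalent of that deduplication step (a sketch, assuming the same long-format author table; the `unique_authors` name is illustrative) would be:

# Keep the first row encountered for each unique (authid, authname) combination,
# mirroring dplyr's distinct(..., .keep_all = TRUE).
unique_authors = list_articles.drop_duplicates(subset=['authid', 'authname'], keep='first')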
Author1 | Author2 |
---|---|
Paul | Luc |
Paul | Claire |
Paul | Anne |
Luc | Claire |
Luc | Anne |
Claire | Anne |
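The table above lists the six unique pairs for a four-author article; `itertools.combinations` enumerates exactly these pairs (illustrative example, using the names from the table):

from itertools import combinations

authors = ["Paul", "Luc", "Claire", "Anne"]
print(list(combinations(authors, 2)))
# [('Paul', 'Luc'), ('Paul', 'Claire'), ('Paul', 'Anne'),
#  ('Luc', 'Claire'), ('Luc', 'Anne'), ('Claire', 'Anne')]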
# Imports required by this snippet.
import pandas as pd
from itertools import combinations

# Filter articles by a range of years using the 'year' column.
filtered_articles = list_articles[(list_articles['year'] >= start_year) & (list_articles['year'] <= end_year)]

# Initialize an empty list to hold the author pairs.
author_pairs = []

# Group the filtered articles by 'entry_number' and aggregate 'authid' and 'authname' into lists.
grouped = filtered_articles.groupby('entry_number')[['authid', 'authname']].agg(list).reset_index()

# Iterate over each grouped entry.
for _, row in grouped.iterrows():
    # Get the entry number from the row.
    entry_number = row['entry_number']
    # Get the list of author IDs for this entry.
    authors = row['authid']
    # Get the list of author names for this entry.
    authnames = row['authname']
    # If there is only one author, append a pair of the same author to the list.
    if len(authors) == 1:
        author_pairs.append((entry_number, authors[0], authors[0], authnames[0], authnames[0]))
    # If there is more than one author, create all possible unique pairs.
    elif len(authors) > 1:
        # Create combinations of author indices: (0, 1), (0, 2), etc.
        author_combinations = list(combinations(range(len(authors)), 2))
        # For each combination of indices, append the corresponding author IDs and names to the list.
        for i, j in author_combinations:
            author_pairs.append((entry_number, authors[i], authors[j], authnames[i], authnames[j]))

# Create a DataFrame from the list of author pairs with the specified column names.
result_df = pd.DataFrame(author_pairs, columns=['entry_number', 'authid1', 'authid2', 'authname1', 'authname2'])

# Extract only the author name columns to create a collaboration DataFrame.
collaboration_df = result_df[["authname1", "authname2"]]
Before aggregation, the same collaboration can appear twice with the authors in reverse order, e.g. (Paul, Luc) and (Luc, Paul):

Author1 | Author2 | Weight |
---|---|---|
Paul | Luc | 1 |
Claire | Anne | 1 |
Claire | Louis | 1 |
Luc | Paul | 1 |
After treating the pairs as undirected and summing their weights (a sketch of this aggregation follows the table), each collaboration appears only once:

Author1 | Author2 | Weight |
---|---|---|
Paul | Luc | 2 |
Claire | Anne | 1 |
Claire | Louis | 1 |
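The aggregation step itself is not shown above; a minimal sketch (assuming the `collaboration_df` built earlier, and producing the `value` weight column used when building the graph below) could look like this:

# Put each pair in a canonical (alphabetical) order so that (A, B) and (B, A) are identical.
pairs = collaboration_df[['authname1', 'authname2']].copy()
swap = pairs['authname1'] > pairs['authname2']
pairs.loc[swap, ['authname1', 'authname2']] = pairs.loc[swap, ['authname2', 'authname1']].values

# Count how many articles share each pair; the count becomes the edge weight 'value'.
collaboration_df = pairs.groupby(['authname1', 'authname2']).size().reset_index(name='value')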
# Imports required here: networkx for the graph, functools.partial to preset keyword arguments.
import networkx as nx
from functools import partial

# Create a graph from a pandas DataFrame using 'authname1' and 'authname2' as the nodes
# and 'value' as the edge weight.
G = nx.from_pandas_edgelist(collaboration_df, 'authname1', 'authname2',
                            edge_attr='value', create_using=nx.Graph())

# Set a default color for all edges in the graph.
for u, v in G.edges:
    G[u][v]["color"] = "#7D7C7C"

# Define a dictionary of network analysis functions to compute different centrality metrics.
metrics = {
    'centrality': nx.degree_centrality,  # Basic centrality measure based on degree
    'betweenness': nx.betweenness_centrality,  # How often a node sits on shortest paths between others
    'closeness': nx.closeness_centrality,  # Average distance to all other nodes
    'eigenvector_centrality': partial(nx.eigenvector_centrality, max_iter=1000),  # Measure of node influence
    'burt_constraint_weighted': partial(nx.constraint, weight="value"),  # Burt's constraint, using edge weights
    'burt_constraint_unweighted': nx.constraint  # Burt's constraint, ignoring edge weights
}

# Apply each centrality metric to the graph and store the result as a node attribute.
for attr, func in metrics.items():
    nx.set_node_attributes(G, func(G), attr)
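As a quick sanity check (illustrative only, not part of the original workflow), the computed attributes can be read back from the nodes, for instance to list the most central authors:

# Sort nodes by degree centrality, highest first, and show the top 10.
top_authors = sorted(G.nodes(data='centrality'), key=lambda x: x[1], reverse=True)[:10]
for name, centrality in top_authors:
    print(f"{name}: {centrality:.3f}")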
# Retrieve author information from the filtered articles using a custom function
author_info = get_author_info(filtered_articles, COLUMNS_TO_COLLECT)
# Set additional author attributes to the graph nodes based on the author_info
for col in COLUMNS_TO_COLLECT:
    nx.set_node_attributes(G, author_info[col], col)
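`nx.set_node_attributes` expects a mapping from node to value for each attribute name, so `author_info[col]` is assumed to be a dictionary keyed by author name. A hypothetical illustration:

# Hypothetical values, for illustration only: map each node (author name) to a
# citation count, then attach it under the 'citations' attribute used below.
example_citations = {"Paul": 12, "Luc": 4, "Claire": 7}
nx.set_node_attributes(G, example_citations, "citations")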
from pyvis.network import Network

# Build an interactive pyvis network (canvas sizes are given as CSS strings, e.g. "1500px").
net = Network(notebook=True, cdn_resources='remote', width="1500px", height="1500px", bgcolor="white", font_color="black")
# net.show_buttons(filter_=['physics'])
net.set_options("""
const options = {
  "physics": {
    "forceAtlas2Based": {
      "gravitationalConstant": -13,
      "centralGravity": 0.015,
      "springLength": 70
    },
    "minVelocity": 0.75,
    "solver": "forceAtlas2Based"
  }
}
""")
net.from_nx(G)
net.show("networks/authors/network_2022_2023_pyvis.html")
# Sigma is provided by the ipysigma package.
from ipysigma import Sigma

Sigma.write_html(G,
                 default_edge_type="curve",  # Default edge type
                 clickable_edges=True,  # Make edges clickable
                 edge_size="value",  # Edge size driven by the 'value' weight
                 fullscreen=True,  # Display in fullscreen
                 label_density=3,  # Label density (increase to show more labels at normal zoom)
                 label_font="Helvetica Neue",  # Label font
                 max_categorical_colors=10,  # Maximum number of categorical colors
                 node_border_color_from='node',  # Take the node border color from the node color
                 node_color="community",  # Node color driven by the 'community' attribute
                 node_label_size="citations",  # Node label size driven by the 'citations' attribute
                 node_label_size_range=(12, 36),  # Node label size range
                 node_metrics={"community": {"name": "louvain", "resolution": 1}},  # Compute Louvain communities
                 node_size="citations",  # Node size driven by the 'citations' attribute
                 node_size_range=(3, 30),  # Node size range
                 path=f"networks/authors/{start_year}_{end_year}_sigma_v2.html",  # Output file path
                 start_layout=3,  # Start the layout algorithm
                 # node_border_color="black",  # Node border color
                 # edge_color="#7D7C7C",  # Edge color
                 # node_label_color="community",  # Node label color
                 )
return G, df
Code | Slides | Personal Github