Introduction
This notebook shows how to customize the main components of the BERTopic pipeline: the embedding model, UMAP, and HDBSCAN. See the explainer and tutorial by pinecone.io for details.
Imports
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import json
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import seaborn as sns
import time
import torch
import umap
import hdbscan
Load publication data
df = pd.read_csv(INPUT_FILE)
# Deduplicate by DOI and drop rows missing a DOI or an abstract
df = df.drop_duplicates(subset=["doi"])\
       .dropna(subset=["doi", "abstract"]).reset_index(drop=True)
sentences = (df["title"] + " " + df["abstract"]).values
Embeddings
Compute or load sentence embeddings
# SPECTER is trained on scientific papers (title + abstract pairs). The model
# is defined outside the if-branch because it is also needed later when
# building the BERTopic model, even if embeddings are loaded from disk.
embedding_model = SentenceTransformer('allenai-specter')
if COMPUTE_EMBEDDINGS:
    embeddings = embedding_model.encode(sentences, normalize_embeddings=True)
    np.save(OUTPUT_FILE_TOPICS_EMBEDDINGS, embeddings)
else:
    embeddings = np.load(OUTPUT_FILE_TOPICS_EMBEDDINGS)
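A quick sanity check can catch shape mismatches early (a minimal sketch using the arrays from the cells above):
# Expect one vector per document; SPECTER produces 768-dimensional embeddings
print(embeddings.shape, len(sentences))
# With normalize_embeddings=True, every row should have unit L2 norm
print(np.linalg.norm(embeddings[:5], axis=1))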
Sample embeddings
# Randomly subsample 3,000 embeddings to keep the UMAP hyperparameter search fast
perm = torch.randperm(embeddings.shape[0])
idx = perm[:3000].numpy()  # convert to NumPy for indexing the NumPy array
samples = embeddings[idx, :]
UMAP
Hyperparameter search for n_neighbors
fig, ax = plt.subplots(3, 3, figsize=(14, 14))
nns = [2, 3, 4, 5, 10, 15, 30, 50, 100]
for k, n_neighbors in enumerate(tqdm(nns)):
    fit = umap.UMAP(n_neighbors=n_neighbors)
    u = fit.fit_transform(samples)
    sns.scatterplot(x=u[:, 0], y=u[:, 1], ax=ax[k // 3, k % 3])
    ax[k // 3, k % 3].set_title(f'n={n_neighbors}')
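Visual inspection can be complemented with a quantitative check. One option (a sketch, not part of the original notebook) is scikit-learn's trustworthiness score, which measures how well local neighborhoods from the original embedding space are preserved in the 2-D projection:
from sklearn.manifold import trustworthiness

for n_neighbors in [5, 15, 50]:
    u = umap.UMAP(n_neighbors=n_neighbors).fit_transform(samples)
    # 1.0 means local neighborhoods are perfectly preserved in the projection
    t = trustworthiness(samples, u, n_neighbors=10)
    print(f"n_neighbors={n_neighbors}: trustworthiness={t:.3f}")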
Hyperparameter search for min_dist
n_neighbors = 5
fig, ax = plt.subplots(2, 3, figsize=(14, 14))
min_dists = [0.01, 0.05, 0.10, 0.33, 0.5, 1.0]
for k, min_dist in enumerate(tqdm(min_dists)):
    fit = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist)
    u = fit.fit_transform(samples)
    sns.scatterplot(x=u[:, 0], y=u[:, 1], ax=ax[k // 3, k % 3])
    ax[k // 3, k % 3].set_title(f'n={n_neighbors}, min_dist={min_dist}')
Fit UMAP
min_dist = 0.1
n_neighbors = 5
umap_fitter = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist)
u = umap_fitter.fit_transform(embeddings)
# Plot the full corpus; tiny, translucent markers avoid overplotting
sns.scatterplot(x=u[:, 0], y=u[:, 1], alpha=0.01, s=0.05)
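Note that this 2-D projection is mainly for visualization. For the clustering step itself, BERTopic's own default UMAP configuration keeps more dimensions and uses cosine distance; a sketch of that alternative is below (the rest of this notebook keeps the 2-D fit):
# Settings close to BERTopic's default UMAP model
umap_cluster = umap.UMAP(
    n_neighbors=n_neighbors,
    n_components=5,   # more room for cluster structure than 2-D
    min_dist=0.0,     # pack points tightly for density-based clustering
    metric='cosine',
)
u5 = umap_cluster.fit_transform(embeddings)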
HDBSCAN
Clustering with defaults
clusterer = hdbscan.HDBSCAN()
clusterer.fit(u)
clusterer.condensed_tree_.plot(select_clusters= True )
Clustering with optimized parameters
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=200,    # merge small clusters into larger topics
    min_samples=100,         # higher values label more points as noise
    gen_min_span_tree=True,  # keep the spanning tree for validity scoring
    prediction_data=True     # required for BERTopic's probability estimates
)
clusterer.fit(u)
clusterer.condensed_tree_.plot(select_clusters=True)
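With gen_min_span_tree=True, hdbscan also exposes a relative validity (DBCV) score, which can be used to compare parameter settings numerically (a sketch using the fitted clusterer above):
labels = clusterer.labels_
n_clusters = labels.max() + 1
noise_frac = (labels == -1).mean()
print(f"{n_clusters} clusters, {noise_frac:.1%} noise")
# Relative DBCV score; higher is better when comparing settings
print(f"relative validity: {clusterer.relative_validity_:.3f}")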
Custom BERTopic Model
BERTopic with customized models
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('stopwords')
# Extend the standard English stop words with corpus-specific terms that would
# otherwise dominate every topic. Note that CountVectorizer filters stop words
# per token, so multi-word entries like "covid 19" have no effect on their own;
# the unigrams "covid" and "19" do the filtering.
stop_words = list(stopwords.words('english')) + [
    "covid", "covid 19", "19", "sars", "cov", "sars cov",
    "patients", "epidemic", "pandemic", "health", "sec", "sec sec"
]
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words=stop_words)
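To see what this vectorizer will feed into BERTopic's c-TF-IDF step, it can be fitted on a couple of toy strings (an illustrative example, not from the original notebook; BERTopic refits the vectorizer internally):
demo = ["lockdown measures reduced transmission rates",
        "vaccine trials reported strong efficacy"]
X = vectorizer_model.fit_transform(demo)
# Unigrams and bigrams, minus the stop-word list defined above
print(vectorizer_model.get_feature_names_out())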
if COMPUTE_TOPICS:
    topic_model = BERTopic(
        umap_model=umap_fitter,
        hdbscan_model=clusterer,
        embedding_model=embedding_model,
        vectorizer_model=vectorizer_model,
        calculate_probabilities=True,  # needs prediction_data=True on HDBSCAN
        top_n_words=5,
        language='english',
        verbose=True
    )
    # Pass the precomputed embeddings so the documents are not re-encoded
    topics, probs = topic_model.fit_transform(sentences, embeddings)
    topic_model.save(OUTPUT_FILE_TOPICS_BERTOPIC, save_embedding_model=False)
else:
    topic_model = BERTopic.load(OUTPUT_FILE_TOPICS_BERTOPIC)
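Once fitted (or loaded), the topic overview can be inspected as a DataFrame via BERTopic's get_topic_info:
# One row per topic: topic id, document count, and top terms.
# Topic -1 collects the outliers labeled as noise by HDBSCAN.
info = topic_model.get_topic_info()
print(info.head(10))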
Visualize topics
topic_model.visualize_barchart(top_n_topics= 50 )
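BERTopic ships several other interactive views that work on the same fitted model, for example an intertopic distance map and a topic hierarchy:
# 2-D intertopic distance map (analogous to LDAvis)
topic_model.visualize_topics()
# Dendrogram of topics based on their c-TF-IDF similarity
topic_model.visualize_hierarchy(top_n_topics=50)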