Authors:
Jannis Kreienkamp1,
Maximilian Agostini1,
Laura F. Bringmann1,
Peter de Jonge1,
Kai Epstude1
1University of Groningen, Department of Psychology
Author Information:
Correspondence concerning this article should be addressed to Jannis Kreienkamp, Department of Psychology, University of Groningen, Grote Kruisstraat 2/1, 9712 TS Groningen (The Netherlands). E-mail: j.kreienkamp@rug.nl
The main manuscript is available at doi.org/ToBePublished.
The data repository for this manuscript is available at doi.org/10.17605/OSF.IO/PR9ZS.
The GitHub repository for this manuscript is available at janniscodes.github.io/intergroup-contact-needs/.
from datetime import date
print("Last Render Time:", date.today())
Last Render Time: 2023-05-18
In this notebook, we describe the topic modeling procedure for the core motives reported following outgroup interactions. Whenever participants had an interaction with an outgroup member in the preceding part of the day (i.e., morning or afternoon), we asked them to report what their main goal was during the interaction (i.e., their key interaction motive). To gain a deeper understanding of these key interaction motives, it is important to explore the free-text responses and extract the most common themes and topics within the key motives.
Because our participants jointly reported several thousand intergroup contacts, it would not be feasible to analyse these qualitative responses in a traditional qualitative content analysis. We instead rely on recent machine learning advances within the natural language processing domain. The goal of most natural language processing is to use the computational power of machines to 'understand' the content of large sets of text documents. For our analysis we use the BERT language model. BERT (Bidirectional Encoder Representations from Transformers) was developed by Google and has been available as an open-source machine learning framework since 2018. Today, Google uses BERT in its search engine for almost every English-language query.
In essence, BERT is a framework that allows users to codify every word in relation to every other word within a large set of documents. This task is immensely computationally intensive, and most users of the framework therefore do not train their own model. The original BERT models are based on encoding the BookCorpus library as well as Wikipedia (a total of around 3,300M words). These pre-trained models are often supplemented with a variety of additional, more specialized text corpora and come with different levels of initial encoding (strongly affecting the file sizes of the pre-trained models). As a result, a variety of BERT models are available.
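To illustrate what such a pre-trained model does in practice, the following minimal sketch (not part of our analysis pipeline) loads a small, publicly available sentence-transformer model and compares the embeddings of three made-up goal descriptions; the model name and example texts are chosen for illustration only.
# Minimal illustration (not part of the analysis pipeline): a pre-trained
# sentence-transformer turns short texts into fixed-length vectors whose
# similarity reflects semantic relatedness.
from sentence_transformers import SentenceTransformer, util
example_model = SentenceTransformer("all-MiniLM-L6-v2")  # small public example model
example_goals = ["practice my Dutch", "improve my language skills", "buy groceries"]
example_vectors = example_model.encode(example_goals)
# The first two goals should be more similar to each other than to the third.
print("language vs. language: ", float(util.cos_sim(example_vectors[0], example_vectors[1])))
print("language vs. groceries:", float(util.cos_sim(example_vectors[0], example_vectors[2])))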
Below we will describe (1) the preparation of the Python environment, (2) the data import and pre-processing, (3) the embedding of the goal descriptions, (4) the dimension reduction, (5) the clustering, and (6) the extraction and interpretation of the topics.
We begin by preparing the Python environment. This Jupyter Notebook uses a conda environment to ensure reproducibility. To activate the environment, run the following command in your terminal:
conda activate BERTTM
We import most relevant packages at this point.
# Conda environment: BERTTM
# In terminal:
# conda activate BERTTM
# Import packages
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import seaborn as sns
from umap import UMAP
from hdbscan import HDBSCAN
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from tqdm.auto import tqdm
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
The key motives are imported from a CSV file and stored in the goals data frame. The data frame also contains a number of metadata columns (including participant ID, timestamp, and interaction type). We also extract the key motives as list and dictionary objects for further pre-processing.
The data is available at: ...
# Import text data
goals = pd.read_csv('/Users/jannis/SynologyDrive/PhD/Phd Research [shared]/Acculturation over Time - Qualitative Goals/need-content/data/keyMotiveOutgroupInteraction.csv')
# text data to list
goals_list = goals['text'].tolist()
# text data to dictionary
goals_dict = goals['text'].to_dict()
In the pre-processing step, we (1) remove duplicate responses (so that the dimension reduction works correctly), (2) correct spelling, (3) lowercase the responses, (4) apply (optional) stemming, and (5) remove stopwords, before (6) combining the processed text versions in a single data frame.
# remove duplicates for the dimension reduction to work correctly
goals_dedup = list(dict.fromkeys(goals_list))
# report N.s
print("Note: Before deduplication: ", len(goals_list), ", after deduplication ", len(goals_dedup), sep="")
# Spelling correction
from itertools import islice
import pkg_resources
from symspellpy import SymSpell, Verbosity
# load spelling dictionaries
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
"symspellpy", "frequency_dictionary_en_82_765.txt"
)
bigram_path = pkg_resources.resource_filename(
"symspellpy", "frequency_bigramdictionary_en_243_342.txt"
)
# term_index is the column of the term and count_index is the column of the term frequency
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
# Which goal list to use
input_list = goals_dedup
# Apply spelling correction to all documents
goals_spelling = []
for l in range(len(input_list)):
input_term = input_list[l]
suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)
# suggestions = sym_spell.lookup(input_term, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True)
list_inner = ""
for suggestion in suggestions:
list_inner = list_inner + suggestion.term
goals_spelling.append(list_inner)
# Lowercasing (probably not necessary because it's already part of the embedding)
goals_spelling_lower = list(map(lambda x: x.lower(), goals_spelling))
# Stemming (optional)
porter = PorterStemmer()
goals_stem = []
for sentence in goals_spelling_lower:
goals_stem.append(" ".join([porter.stem(i) for i in sentence.split()]))
# remove stopwords
stop_words = stopwords.words('english')
goals_stem_stop = []
for sentence in goals_stem:
goals_stem_stop.append(" ".join([i for i in sentence.split() if i not in stop_words]))
# join goals_dedup, goals_spelling_lower, and goals_stem in one data frame
goals_processed_df = pd.DataFrame({'goals_dedup': goals_dedup, 'goals_spelling_lower': goals_spelling_lower, 'goals_stem': goals_stem, 'goals_stem_stop': goals_stem_stop})
Note: Before deduplication: 2983, after deduplication 1851
For the main modeling procedure, we follow the extensive set of tutorials by James Briggs (https://www.pinecone.io/learn/bertopic/ and https://github.com/pinecone-io/examples/tree/master/learn/algos-and-libraries/bertopic).
We begin by embedding the interaction goals of our participants. Embedding is the process of turning the individual words into numerical data. Here we use the information from the pre-trained model to represent each word as a set of numbers, depending on the context of the word. We use the "all-mpnet-base-v2" language model for our embedding. The model is a community-based version of BERT that uses Microsoft's "mpnet-base" model and is fine-tuned with 1B diverse sentence pairs. The model maps each text to a 768-dimensional dense vector space (a relatively high dimensionality). As a result, the model is widely considered to be one of the most accurate all-purpose language models within the open-source BERT framework.
# Step 1 - Extract embeddings
# chose embedding model
embedding_model = SentenceTransformer("all-mpnet-base-v2")
# Embed goals in batches of 16
data = goals_spelling_lower
n = len(data)
batch_size = 16
embeds = np.zeros((n, embedding_model.get_sentence_embedding_dimension()))
for i in tqdm(range(0, n, batch_size), disable=True):
i_end = min(i+batch_size, n)
batch = data[i:i_end]
batch_embed = embedding_model.encode(batch)
embeds[i:i_end,:] = batch_embed
Before we can cluster the embedded interaction goals, we compress the embeddings to a lower-dimensional space. The initial embedding represents each goal description with a vector of 768 numbers. There are three main issues that warrant a dimension reduction: (1) the full number of embedding dimensions is very likely not necessary for our topic modeling, (2) the clustering algorithm does not work well with such high dimensionality and works much more efficiently in a reduced dimensional space, and (3) reducing the dimensionality to 2 or 3 dimensions means that we can visualize the embeddings to assess the performance of the clustering algorithm.
In our case, we use the recommended UMAP dimension reduction. UMAP (Uniform Manifold Approximation and Projection) has been shown to work well with BERT language models. UMAP specifically performs better than other popular dimension reduction methods, such as t-SNE and PCA, because it is better at retaining both the local and the global structure of the data. The UMAP function uses two main arguments: the n_neighbors and the min_dist parameters. The n_neighbors parameter determines for how many of the nearest points the distance should be preserved. This also means that smaller n_neighbors values preserve local structures well, whereas larger n_neighbors values preserve more of the global density structure. The min_dist parameter determines how close the individual embedded documents can be to one another in the reduced space.
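To build some intuition for these two parameters before exploring them on our own embeddings (see the commented-out grid search below), the following small sketch applies UMAP to random toy data (not our goal embeddings) and shows how a larger min_dist value spreads neighbouring points further apart in the reduced space.
# Illustrative sketch on random toy data (not our goal embeddings): larger
# min_dist values push neighbouring points further apart in the 2D projection.
import numpy as np
from umap import UMAP
from sklearn.neighbors import NearestNeighbors
rng = np.random.default_rng(7)
toy_embeds = rng.normal(size=(300, 50))  # 300 toy "documents" in 50 dimensions
for md in [0.0, 0.5]:
    toy_proj = UMAP(n_neighbors=10, min_dist=md, random_state=7).fit_transform(toy_embeds)
    nn_dists, _ = NearestNeighbors(n_neighbors=2).fit(toy_proj).kneighbors(toy_proj)
    print(f"min_dist={md}: median nearest-neighbour distance = {np.median(nn_dists[:, 1]):.3f}")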
# # check the 2D distributions for varying values of n_neighbors and min_dist
# nns = list(range(3, 19))
# nns.extend([30, 50, 100, 250])
# min_dist = [0.00, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10, 0.20]
# for dist in tqdm(min_dist, disable=True):
# fig, ax = plt.subplots(5, 4, figsize=(16, 16))
# fig.suptitle(f'min_dist={dist}')
# i, j = 0, 0
# for n_neighbors in tqdm(nns, disable=True):
# fit = UMAP(n_neighbors=n_neighbors, min_dist=dist, random_state=7)
# u = fit.fit_transform(embeds)
# sns.scatterplot(x=u[:,0], y=u[:,1], ax=ax[j, i])
# ax[j, i].set_title(f'n={n_neighbors}')
# if i < 3: i += 1
# else: i = 0; j += 1
# fig.subplots_adjust(top=0.95)
# plt.savefig(f'figures/umap/intergroup/min_dist={dist}.pdf')
# plt.savefig(f'figures/umap/intergroup/min_dist={dist}.png', dpi=300)
import glob
import matplotlib.image as mpimg
images = [mpimg.imread(file) for file in sorted(glob.glob('figures/umap/intergroup/min_dist=*.png'))]
rows = 2
columns = 6
fig = plt.figure(figsize=(90, 30))
for image in range(len(images)):
fig.add_subplot(rows, columns, image+1)
plt.imshow(images[image])
plt.axis('off')
fig.savefig('figures/umap/intergroup/min_dist.png')
# 3D UMAP check
palette = ['#1c17ff', '#faff00', '#8cf1ff', '#738FAB', '#030080', '#738fab']
# alternative = [(11, 0.01), (8, 0.04), (6, 0.04), (7, 0.04), ()]
nneighbors = 10
mindist = 0.04
# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=nneighbors,
n_components=3,
min_dist=mindist,
metric='cosine',
random_state=7)
fit = UMAP(n_neighbors=nneighbors, n_components=3, min_dist=mindist, metric='cosine', random_state=7) #7
u = fit.fit_transform(embeds)
fig = px.scatter_3d(
x=u[:,0], y=u[:,1], z=u[:,2],
# color=data[:n],
custom_data=[data[:n]],
color_discrete_sequence=[palette[0]]
)
fig.update_traces(
hovertemplate="<br>".join([
"text: %{customdata[0]}"
]),
marker_size = 1
)
from IPython.display import IFrame
fig.write_html("outgroup-interactions-umap-topics-3d.html", include_plotlyjs="cdn", full_html=False)
IFrame(src='outgroup-interactions-umap-topics-3d.html', width='100%', height=600)
Within the reduced dimensional space, we can then identify goal descriptions that are close to one another in the language model space. In our case we use the HDBSCAN algorithm (Hierarchical Density-Based Spatial Clustering of Applications with Noise). HDBSCAN is a hierarchical, density-based approach that comes with a set of associated benefits: (1) the density-based approach means that the algorithm does not make any cluster shape assumptions but identifies the groupings by following dense areas, (2) the density-based approach also means that not all documents need to be assigned to a cluster; instead, the approach allows outliers or noise, and (3) the hierarchical method offers additional visualization and cluster-selection options.
The two main parameters of HDBSCAN are min_cluster_size and min_samples. The min_cluster_size parameter specifies the minimum number of data points that have to be assigned to each cluster. The min_samples parameter specifies how dense a region needs to be to form a cluster.
Given the exploratory nature of our analysis aim and to retain a large amount of information, we choose relatively small values for both parameters, which retains a relatively large number of clusters while keeping the number of noise assignments low.
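To make the effect of these two parameters concrete before tuning them on our own embeddings, the following small sketch clusters synthetic toy data (not our goal descriptions) under a few different settings and reports the resulting number of clusters and noise points.
# Illustrative sketch on synthetic toy data (not our goal descriptions): how
# different min_cluster_size / min_samples settings change the number of
# clusters and the number of points labeled as noise (-1).
from sklearn.datasets import make_blobs
from hdbscan import HDBSCAN
toy_points, _ = make_blobs(n_samples=500, centers=5, cluster_std=1.5, random_state=7)
for mcs, ms in [(5, 5), (15, 5), (50, 25)]:
    toy_clusterer = HDBSCAN(min_cluster_size=mcs, min_samples=ms).fit(toy_points)
    n_clusters = len(set(toy_clusterer.labels_)) - (1 if -1 in toy_clusterer.labels_ else 0)
    n_noise = list(toy_clusterer.labels_).count(-1)
    print(f"min_cluster_size={mcs}, min_samples={ms}: {n_clusters} clusters, {n_noise} noise points")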
# Tune HDBSCAN hyper parameters
from sklearn.preprocessing import normalize
from IPython.display import display, HTML
min_clust = list(range(5, 51, 5))
min_sample = list(range(5, 40, 5))
clust_pam = []
for clust in tqdm(min_clust, disable=True):
for sample in tqdm(min_sample, disable=True):
clusterer = HDBSCAN(
min_cluster_size=clust, # merges smaller clusters
min_samples=sample # allowing sparser clusters to be pulled in
)
clusterer.fit(u)
clust_pam.append(
{
'min_clust': clust,
'min_sample': sample,
'n_clusters': len(set(clusterer.labels_)),
'n_noise': list(clusterer.labels_).count(-1),
'p_noise': round(list(clusterer.labels_).count(-1)/len(data)*100, 2)
}
)
# print("[min_clust=", clust,
# ", min_sample=", sample, "] ",
# len(set(clusterer.labels_)), " clusters, unclassified = ",
# list(clusterer.labels_).count(-1),
# " (i.e.,", round(list(clusterer.labels_).count(-1)/len(data)*100, 2), "%)", sep='')
clust_pam = pd.DataFrame(clust_pam)
clust_pam['prod'] = (normalize([clust_pam['n_clusters']])*normalize([clust_pam['n_noise']])*100)[0]
display(HTML(clust_pam.sort_values('prod').to_html()))
#clust_pam.sort_values('prod').head(40)
 | min_clust | min_sample | n_clusters | n_noise | p_noise | prod |
---|---|---|---|---|---|---|
34 | 25 | 35 | 2 | 0 | 0.00 | 0.000000 |
55 | 40 | 35 | 2 | 0 | 0.00 | 0.000000 |
48 | 35 | 35 | 2 | 0 | 0.00 | 0.000000 |
13 | 10 | 35 | 2 | 0 | 0.00 | 0.000000 |
27 | 20 | 35 | 2 | 0 | 0.00 | 0.000000 |
62 | 45 | 35 | 2 | 0 | 0.00 | 0.000000 |
20 | 15 | 35 | 2 | 0 | 0.00 | 0.000000 |
41 | 30 | 35 | 2 | 0 | 0.00 | 0.000000 |
6 | 5 | 35 | 2 | 0 | 0.00 | 0.000000 |
45 | 35 | 20 | 4 | 14 | 0.76 | 0.008474 |
44 | 35 | 15 | 4 | 14 | 0.76 | 0.008474 |
53 | 40 | 25 | 4 | 14 | 0.76 | 0.008474 |
26 | 20 | 30 | 4 | 14 | 0.76 | 0.008474 |
47 | 35 | 30 | 4 | 14 | 0.76 | 0.008474 |
40 | 30 | 30 | 4 | 14 | 0.76 | 0.008474 |
33 | 25 | 30 | 4 | 14 | 0.76 | 0.008474 |
51 | 40 | 15 | 4 | 14 | 0.76 | 0.008474 |
52 | 40 | 20 | 4 | 14 | 0.76 | 0.008474 |
39 | 30 | 25 | 4 | 14 | 0.76 | 0.008474 |
46 | 35 | 25 | 4 | 14 | 0.76 | 0.008474 |
54 | 40 | 30 | 4 | 14 | 0.76 | 0.008474 |
38 | 30 | 20 | 4 | 14 | 0.76 | 0.008474 |
56 | 45 | 5 | 4 | 14 | 0.76 | 0.008474 |
57 | 45 | 10 | 4 | 14 | 0.76 | 0.008474 |
58 | 45 | 15 | 4 | 14 | 0.76 | 0.008474 |
12 | 10 | 30 | 4 | 14 | 0.76 | 0.008474 |
59 | 45 | 20 | 4 | 14 | 0.76 | 0.008474 |
60 | 45 | 25 | 4 | 14 | 0.76 | 0.008474 |
61 | 45 | 30 | 4 | 14 | 0.76 | 0.008474 |
5 | 5 | 30 | 4 | 14 | 0.76 | 0.008474 |
19 | 15 | 30 | 4 | 14 | 0.76 | 0.008474 |
37 | 30 | 15 | 4 | 14 | 0.76 | 0.008474 |
63 | 50 | 5 | 3 | 59 | 3.19 | 0.026782 |
64 | 50 | 10 | 3 | 59 | 3.19 | 0.026782 |
65 | 50 | 15 | 3 | 59 | 3.19 | 0.026782 |
66 | 50 | 20 | 3 | 59 | 3.19 | 0.026782 |
67 | 50 | 25 | 3 | 59 | 3.19 | 0.026782 |
69 | 50 | 35 | 3 | 59 | 3.19 | 0.026782 |
68 | 50 | 30 | 3 | 59 | 3.19 | 0.026782 |
35 | 30 | 5 | 23 | 370 | 19.99 | 1.287674 |
28 | 25 | 5 | 27 | 336 | 18.15 | 1.372712 |
36 | 30 | 10 | 21 | 486 | 26.26 | 1.544301 |
43 | 35 | 10 | 20 | 516 | 27.88 | 1.561550 |
42 | 35 | 5 | 21 | 498 | 26.90 | 1.582432 |
50 | 40 | 10 | 17 | 625 | 33.77 | 1.607701 |
29 | 25 | 10 | 25 | 428 | 23.12 | 1.619049 |
49 | 40 | 5 | 18 | 608 | 32.85 | 1.655970 |
22 | 20 | 10 | 29 | 394 | 21.29 | 1.728903 |
21 | 20 | 5 | 33 | 358 | 19.34 | 1.787612 |
32 | 25 | 25 | 23 | 628 | 33.93 | 2.185565 |
14 | 15 | 5 | 47 | 308 | 16.64 | 2.190407 |
30 | 25 | 15 | 26 | 561 | 30.31 | 2.207052 |
31 | 25 | 20 | 25 | 593 | 32.04 | 2.243216 |
25 | 20 | 25 | 24 | 629 | 33.98 | 2.284221 |
24 | 20 | 20 | 28 | 573 | 30.96 | 2.427666 |
16 | 15 | 15 | 35 | 469 | 25.34 | 2.483803 |
23 | 20 | 15 | 31 | 540 | 29.17 | 2.532980 |
11 | 10 | 25 | 26 | 661 | 35.71 | 2.600466 |
18 | 15 | 25 | 26 | 661 | 35.71 | 2.600466 |
17 | 15 | 20 | 30 | 592 | 31.98 | 2.687319 |
10 | 10 | 20 | 31 | 581 | 31.39 | 2.725299 |
3 | 5 | 20 | 32 | 573 | 30.96 | 2.774476 |
4 | 5 | 25 | 28 | 665 | 35.93 | 2.817449 |
15 | 15 | 10 | 44 | 446 | 24.10 | 2.969367 |
9 | 10 | 15 | 39 | 515 | 27.82 | 3.039122 |
7 | 10 | 5 | 75 | 275 | 14.86 | 3.120831 |
2 | 5 | 15 | 43 | 578 | 31.23 | 3.760734 |
8 | 10 | 10 | 58 | 435 | 23.50 | 3.817628 |
1 | 5 | 10 | 63 | 449 | 24.26 | 4.280192 |
0 | 5 | 5 | 124 | 296 | 15.99 | 5.553793 |
# Check HDBSCAN hyper parameters
min_cluster_size = 15 #35
min_samples = 5 #5
clusterer = HDBSCAN(
min_cluster_size=min_cluster_size, # merges smaller clusters
min_samples=min_samples # allowing sparser clusters to be pulled in
)
clusterer.fit(u)
print(len(set(clusterer.labels_)), " clusters, unclassified = ", list(clusterer.labels_).count(-1), " (i.e.,", round(list(clusterer.labels_).count(-1)/len(data)*100, 2), "%)", sep='')
clusterer.condensed_tree_.plot(select_clusters=True)
47 clusters, unclassified = 308 (i.e.,16.64%)
[Figure: HDBSCAN condensed tree with the selected clusters highlighted (λ value on the y-axis)]
colors = [str(x) for x in clusterer.labels_]
fig = px.scatter_3d(
x=u[:,0], y=u[:,1], z=u[:,2],
color=colors,
custom_data=[data[:n]]
)
fig.update_traces(
hovertemplate="<br>".join([
"text: %{customdata[0]}"
]),
marker_size = 2
)
fig.write_html("outgroup-interactions-HDBSCAN-topics-3d.html", include_plotlyjs="cdn", full_html=False)
IFrame(src='outgroup-interactions-HDBSCAN-topics-3d.html', width='100%', height=600)
# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, # merges smaller clusters
min_samples=min_samples) # allowing sparser clusters to be pulled in
# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")
In a final step of the topic modeling, we seek to extract the most meaningful terms of each cluster (sometimes also called cluster tagging). For this purpose, we use the c-TF-IDF information retrieval method (i.e., class-based term frequency – inverse document frequency). This class-based adaptation of the TF-IDF method leverages the fact that important terms tend to be more frequent in the classes of documents for which they hold more meaning.
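As a rough illustration of this class-based weighting idea, the simplified sketch below (our own toy example, not the internals of BERTopic's ClassTfidfTransformer) treats each string as the concatenated documents of one cluster, weights every term by its within-class frequency times an inverse class frequency, and prints the top terms per class.
# Simplified c-TF-IDF illustration on two made-up "class documents" (all
# documents of a cluster concatenated into one string). This is our own toy
# sketch of the idea, not the ClassTfidfTransformer implementation.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
class_docs = {
    "study": "finish my thesis work on the research experiment and study methods",
    "food": "cook dinner together and share a meal while eating and chatting",
}
vectorizer = CountVectorizer()
tf = vectorizer.fit_transform(class_docs.values()).toarray()   # term counts per class
avg_words_per_class = tf.sum() / tf.shape[0]                   # average class size
idf = np.log(1 + avg_words_per_class / tf.sum(axis=0))         # inverse class frequency
ctfidf = tf * idf                                              # class-based term weights
terms = vectorizer.get_feature_names_out()
for label, row in zip(class_docs, ctfidf):
    top_terms = [terms[i] for i in row.argsort()[::-1][:3]]
    print(label, "->", top_terms)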
# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()
# All steps together
topic_model = BERTopic(
embedding_model=embedding_model, # Step 1 - Extract embeddings
umap_model=umap_model, # Step 2 - Reduce dimensionality
hdbscan_model=hdbscan_model, # Step 3 - Cluster reduced embeddings
vectorizer_model=vectorizer_model, # Step 4 - Tokenize topics
ctfidf_model=ctfidf_model, # Step 5 - Extract topic words
diversity=0.2 # Step 6 - Diversify topic words
#top_n_words=10
# min_topic_size=20,
# language='english',
# calculate_probabilities=True,
# verbose=True
)
topics, probs = topic_model.fit_transform(goals_spelling_lower)
# for each topic in topic_model.get_topics() get the topic and the words
topic_labs = topic_model.generate_topic_labels(10)
# topic_labs to dataframe and split into topic and top_n_words columns based on the first underscore
topic_labs_df = pd.DataFrame(topic_labs)
topic_labs_df[['topic', 'top_n_words']] = topic_labs_df[0].str.split('_', n=1, expand=True)
# replace underscores with commas and spaces in Label column
topic_labs_df['top_n_words'] = topic_labs_df['top_n_words'].str.replace('_', ', ')
# drop first column
topic_labs_df = topic_labs_df.drop(columns=[0])
# change type of Topic column
topic_labs_df['topic'] = topic_labs_df['topic'].astype(int)
# get representative documents for each topic
topic_documents = pd.DataFrame({'topic': topic_model.get_representative_docs().keys(), 'representative_documents': topic_model.get_representative_docs().values()})
# split the representative documents column into three columns
topic_documents[['representative_documents_1', 'representative_documents_2', 'representative_documents_3']] = pd.DataFrame(topic_documents['representative_documents'].tolist(), index=topic_documents.index)
# drop representative_documents column
topic_documents = topic_documents.drop(columns=['representative_documents'])
# left_join goals_topics with topic_documents on topic
topic_labs_df = topic_labs_df.merge(topic_documents, how='left', left_on='topic', right_on='topic')
# add topic and probability to goals_processed_df in a new data frame called goals_topics
goals_topics = goals_processed_df.copy()
goals_topics['topic'] = topics
goals_topics['prob'] = probs
# left_join goals_topics with topic_labs_df on topic
goals_topics = goals_topics.merge(topic_labs_df, how='left', left_on='topic', right_on='topic')
# left_join the goals_topics data frame with the goals data frame
goals_full = goals.merge(goals_topics, how='left', left_on='text', right_on='goals_dedup')
# export topic_labs_df, goals_topics, and goals_full to csv into the output folder
topic_labs_df.to_csv('output/outgroup_interactions-topic_with_top_n_words.csv', index=False)
goals_topics.to_csv('output/outgroup_interactions-dedup_goals_with_topics.csv', index=False)
goals_full.to_csv('output/outgroup_interactions-all_ppt_goals_with_topics.csv', index=False)
print(len(topic_model.get_topic_info()), "topics")
display(topic_model.get_topic_info())
47 topics
 | Topic | Count | Name |
---|---|---|---|
0 | -1 | 314 | -1_statistics_break_smalltalk_anatomy |
1 | 0 | 97 | 0_research_experiment_thesis_methods |
2 | 1 | 67 | 1_personal_sharing_weekend_information |
3 | 2 | 64 | 2_dutch_groningen_netherlands_practising |
4 | 3 | 60 | 3_date_church_hangout_relationship |
5 | 4 | 58 | 4_house_roommate_room_moving |
6 | 5 | 56 | 5_christmas_shopping_market_groceries |
7 | 6 | 54 | 6_pray_chat_buddhism_meditate |
8 | 7 | 54 | 7_progress_app_learning_exams |
9 | 8 | 48 | 8_travel_travelling_malta_destination |
10 | 9 | 46 | 9_dinner_eating_cooking_making |
11 | 10 | 43 | 10_new_relationship_receive_helping |
12 | 11 | 42 | 11_task_assignment_work_instruction |
13 | 12 | 42 | 12_treating_treat_tour_pts |
14 | 13 | 41 | 13_breakfast_morning_greeting_brunch |
15 | 14 | 40 | 14_play_games_sports_poker |
16 | 15 | 39 | 15_interview_communication_discussion_writing |
17 | 16 | 36 | 16_eating_dinner_meal_chatting |
18 | 17 | 35 | 17_tutor_education_teaching_teach |
19 | 18 | 35 | 18_presentation_propane_preparing_pokemon |
20 | 19 | 31 | 19_academic_skills_exercises_improving |
21 | 20 | 30 | 20_psychologist_doctors_doctor_therapeutic |
22 | 21 | 29 | 21_park_rest_relaxing_beach |
23 | 22 | 28 | 22_skating_yoga_gym_dance |
24 | 23 | 28 | 23_work_working_organizing_collaboration |
25 | 24 | 27 | 24_feedback_evaluation_assignment_internship |
26 | 25 | 25 | 25_patient_patients_clinic_consultation |
27 | 26 | 24 | 26_project_meeting_discussing_symposium |
28 | 27 | 24 | 27_seminar_lectures_english_intelligence |
29 | 28 | 24 | 28_studying_study_revise_bible |
30 | 29 | 23 | 29_conversation_conversational_improve_polite |
31 | 30 | 23 | 30_consultation_medical_consult_support |
32 | 31 | 23 | 31_watching_videos_theatre_singing |
33 | 32 | 21 | 32_dinner_communication_social_interaction |
34 | 33 | 21 | 33_chat_chatting_just_conversing |
35 | 34 | 20 | 34_cultural_cultures_culture_intercultural |
36 | 35 | 19 | 35_answers_questions_answer_clarifying |
37 | 36 | 17 | 36_lab_cleaning_clean_kitchen |
38 | 37 | 17 | 37_learning_learn_train_training |
39 | 38 | 17 | 38_socialising_interaction_socializing_social |
40 | 39 | 17 | 39_party_birthday_drinking_celebrating |
41 | 40 | 16 | 40_fun_having_enjoy_free |
42 | 41 | 16 | 41_attending_lecture_lectures_attend |
43 | 42 | 15 | 42_talking_politics_talk_speak |
44 | 43 | 15 | 43_coach_shrek_meeting_group |
45 | 44 | 15 | 44_goal_specific_particular_nope |
46 | 45 | 15 | 45_bike_washing_repair_lamp |
topic_model.visualize_barchart(top_n_topics=len(topic_model.get_topic_info()))
topicBar = topic_model.visualize_barchart(top_n_topics=len(topic_model.get_topic_info()))
# topicBar.write_image("outgroup-interactions-topicBar.pdf")
topicBar.write_html("outgroup-interactions-topicBar.html", include_plotlyjs="cdn", full_html=False)
IFrame(src='outgroup-interactions-topicBar.html', width='100%', height=600)
images = mpimg.imread("topics-themes.png")
fig = plt.figure(figsize=(90, 30))
plt.imshow(images)
plt.axis('off')
[Figure: topics-themes.png — overview of the extracted topic themes]
We extracted a relatively large number of clusters from the interaction goal free-text entries. A number of topics are primarily task-oriented, where participants hoped to improve their study, research, presentation, or work performance. Opposing the task- and work-oriented needs are a wide variety of leisure-related wishes. Specifically, relaxation and entertainment wishes were prominent terms across several topics. Additionally, a number of clusters are primarily relationship-oriented, such that participants sought contact with outgroup members for intimate and casual social contact in itself. Similarly, socializing and celebrations (incl. parties) were also explicit social needs. Another set of topics was more practically oriented, where participants sought to share food and cook together, or had organizational needs (e.g., housing, cleaning, and living arrangements).
Some contact goals were specifically migration-related (e.g., the wish to learn about culture, politics, and language), or concerned inquiry and information needs more generally (e.g., seeking answers, banking information). A further set of topics was specifically geared towards a wish to experience cultural products (e.g., music, theater, food). Similarly, a number of participants had travel-related goals in their interactions with majority group members.
One interesting observation was the importance of contact goals specific to contact through the medical and public health system. This goal type was partly specific to our sample of young medical professionals (e.g., working with patients, treatment), but it also reflected newcomers' interactions with the outgroup majority as patients themselves (e.g., therapeutic goals). Health, fitness, and personal improvement goals (e.g., sports and music) were also common goals that participants shared during their interactions with majority group members.
A final, also underexplored, topic is that of spiritual, religious, and otherwise transcendental needs (incl. meditation, prayer, religious services). This seems to reflect a deep and fundamental need that is mostly ignored in Western secular migrant research.
topic_model.visualize_topics()
distmap = topic_model.visualize_topics()
distmap.write_html("outgroup-interactions-distmap.html", include_plotlyjs="cdn", full_html=False)
IFrame(src='outgroup-interactions-distmap.html', width='100%', height=600)
topic_model.visualize_heatmap(n_clusters=len(topic_model.get_topic_info())-2, width=1000, height=1000)
similarityMat = topic_model.visualize_heatmap(n_clusters=len(topic_model.get_topic_info())-2, width=1000, height=1000)
similarityMat.write_html("outgroup-interactions-similaritymat.html", include_plotlyjs="cdn", full_html=False)
IFrame(src='outgroup-interactions-similaritymat.html', width='100%', height=600)
topic_model.visualize_hierarchy(top_n_topics=len(topic_model.get_topic_info()))
hclust = topic_model.visualize_hierarchy(top_n_topics=len(topic_model.get_topic_info()))
hclust.write_html("outgroup-interactions-hclust.html", include_plotlyjs="cdn", full_html=False)
IFrame(src='outgroup-interactions-hclust.html', width='100%', height=600)
# export jupyter notebook to self-contained html
# !jupyter nbconvert --to html_embed --output-dir='.' BERT-topic-model-outgroup.ipynb
# export jupyter notebook to pdf
# !jupyter nbconvert --to pdf --output-dir='.' BERT-topic-model-outgroup.ipynb
import os
os.system("jupyter nbconvert --to html_embed --output-dir='.' BERT-topic-model-outgroup.ipynb")