PyScript Experiments
Fun with PyScript...
Topic Modeling with PyScript
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import numpy as np
# Handles to the page elements this script reads from and writes to.
# NOTE(review): `Element` is not imported in this file — presumably it is
# injected as a global by the PyScript runtime; confirm against the page setup.
# Input: the document collection, read through `.value` in find_topics.
text_collection_element = Element("text_collection")
# Output: find_topics writes the formatted topic report here.
topics_element = Element("topics")
# Input: the requested number of topics, read through `.value` in find_topics.
no_topics_element = Element("no_topics")
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    """Format each topic's top words and top documents as plain text.

    Args:
        H: 2-D array iterated row by row, one row per topic; position ``i``
            in a row is the weight of ``feature_names[i]`` for that topic.
        W: 2-D array whose column ``topic_idx`` holds one weight per entry
            of ``documents`` for that topic.
        feature_names: Indexable collection of term strings aligned with
            the columns of ``H``.
        documents: Indexable collection of document strings aligned with
            the rows of ``W``.
        no_top_words: How many highest-weighted terms to list per topic.
        no_top_documents: How many highest-weighted documents to list per
            topic.

    Returns:
        One string. Each topic contributes a ``"Topic <n>: w1 w2 ..."``
        line, then one line per top document, then a ``"\\n \\n"`` separator.
        An empty ``H`` yields an empty string.
    """
    # Collect fragments and join once instead of growing a string with +=.
    parts = []
    for topic_idx, topic in enumerate(H):
        # Ascending argsort read backwards from the end gives the indices of
        # the largest weights first; the stop bound keeps no_top_words items.
        top_word_indices = np.argsort(topic)[:-no_top_words - 1:-1]
        top_words = " ".join([feature_names[i] for i in top_word_indices])
        parts.append(f"Topic {topic_idx}: {top_words}\n")

        # Same ranking for documents: sort this topic's column, reverse it,
        # and keep the leading no_top_documents entries.
        top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            parts.append(documents[doc_index] + "\n")

        # Visual gap between consecutive topics.
        parts.append("\n \n")
    return "".join(parts)
def find_topics(*args, **kwargs):
    """Extract topics from the text input and write a report to the page.

    Reads the topic count from ``no_topics_element`` and the documents (one
    per non-blank line) from ``text_collection_element``, builds a TF-IDF
    matrix with English stop words removed, factorizes it with NMF, and
    writes the text produced by ``display_topics`` to ``topics_element``.

    Invalid input (non-integer or non-positive topic count, no documents, or
    a ValueError raised by the vectorizer / NMF) is reported by writing a
    message to ``topics_element`` instead of raising.

    Args:
        *args: Ignored; accepted so the function can be used as a callback.
        **kwargs: Ignored; accepted so the function can be used as a callback.

    Returns:
        None. The result is delivered through ``topics_element.write``.
    """
    no_top_words = 4
    no_top_documents = 2

    raw_no_topics = no_topics_element.value
    try:
        no_topics = int(raw_no_topics)
    except (TypeError, ValueError):
        topics_element.write(
            f"Number of topics must be a whole number, got {raw_no_topics!r}."
        )
        return
    if no_topics < 1:
        topics_element.write("Number of topics must be at least 1.")
        return

    # One document per line; blank and whitespace-only lines are dropped.
    stripped_lines = [line.strip() for line in text_collection_element.value.split("\n")]
    documents = [line for line in stripped_lines if line]
    if not documents:
        topics_element.write("Enter at least one document (one per line).")
        return

    try:
        tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        tfidf = tfidf_vectorizer.fit_transform(documents)
        tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

        nmf = NMF(n_components=no_topics, init='nndsvd').fit(tfidf)
        nmf_W = nmf.transform(tfidf)
        nmf_H = nmf.components_
    except ValueError as err:
        # Raised e.g. when every word is a stop word, or when the requested
        # topic count is rejected by the factorization for this input size.
        topics_element.write(f"Could not extract topics: {err}")
        return

    output = display_topics(
        nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents
    )
    topics_element.write(output)
Back to list of PyScript Experiments