- scikit-learn - numpy

PyScript Experiments

Fun with PyScript...

Topic Modeling with PyScript







from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import numpy as np

# Handles to page elements, looked up by their id strings.
# NOTE(review): `Element` is not imported in this file; presumably it is the
# global injected by the PyScript runtime -- confirm.
text_collection_element = Element("text_collection")  # find_topics reads .value and splits it on newlines into documents
topics_element = Element("topics")  # find_topics writes the formatted topics here
no_topics_element = Element("no_topics")  # find_topics parses .value as the integer topic count

def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
  """Format each topic's top words and top documents as plain text.

  Args:
    H: 2-D array iterated row by row; each row holds one topic's weight per
      entry of ``feature_names``.
    W: 2-D array indexed as ``W[:, topic_idx]``; each column holds one
      topic's weight per entry of ``documents``.
    feature_names: Sequence of words, indexed by the positions in a row of H.
    documents: Sequence of document strings, indexed by the rows of W.
    no_top_words: How many highest-weighted words to list per topic.
    no_top_documents: How many highest-weighted documents to list per topic.

  Returns:
    One string with a section per topic: a ``Topic <n>: <words>`` line, one
    line per top document, and a trailing blank-space-blank separator.
  """
  sections = []
  for topic_number, word_weights in enumerate(H):
    # argsort is ascending, so walk it backwards to get the heaviest words.
    best_word_ids = word_weights.argsort()[:-no_top_words - 1:-1]
    best_words = [feature_names[word_id] for word_id in best_word_ids]

    # Same trick for documents: reverse the ascending order, keep the head.
    descending_doc_ids = np.argsort(W[:, topic_number])[::-1]
    best_doc_ids = descending_doc_ids[0:no_top_documents]

    section_lines = [f"Topic {topic_number:d}: " + " ".join(best_words)]
    section_lines.extend(documents[doc_id] for doc_id in best_doc_ids)
    sections.append("\n".join(section_lines) + "\n" + "\n \n")
  return "".join(sections)

def find_topics(*args, **kwargs):
  """Run NMF topic modeling on the text collection and display the result.

  Reads the topic count from ``no_topics_element`` and the documents (one per
  non-blank line) from ``text_collection_element``, factorizes their TF-IDF
  matrix with NMF, and writes the formatted topics to ``topics_element``.

  Input problems (non-numeric or out-of-range topic count, no documents, no
  usable words) are reported by writing a message to ``topics_element``
  rather than raising.

  Args:
    *args: Ignored.
    **kwargs: Ignored. Both are accepted presumably so the function can be
      wired up as a page event handler -- confirm against the HTML.

  Returns:
    None. The result is written to ``topics_element``.
  """
  try:
    no_topics = int(no_topics_element.value)
  except (TypeError, ValueError):
    topics_element.write("Please enter a whole number of topics.")
    return
  if no_topics < 1:
    topics_element.write("The number of topics must be at least 1.")
    return

  # One document per line; blank and whitespace-only lines are dropped.
  documents = text_collection_element.value.split("\n")
  documents = [x.strip() for x in documents if x.strip()]
  if not documents:
    topics_element.write("Please enter at least one document (one per line).")
    return

  tfidf_vectorizer = TfidfVectorizer(stop_words='english')
  try:
    tfidf = tfidf_vectorizer.fit_transform(documents)
  except ValueError:
    # scikit-learn raises ValueError when the vocabulary ends up empty,
    # e.g. when every document consists only of stop words.
    topics_element.write("No usable words were found in the documents.")
    return
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

  # init='nndsvd' requires n_components <= min(n_documents, n_terms).
  max_topics = min(tfidf.shape)
  if no_topics > max_topics:
    topics_element.write(
      "The number of topics must be at most %d for this text collection." % max_topics
    )
    return

  # fit_transform yields the document-topic matrix from the fit itself,
  # avoiding a second pass over the data with a separate transform() call.
  nmf = NMF(n_components=no_topics, init='nndsvd')
  nmf_W = nmf.fit_transform(tfidf)
  nmf_H = nmf.components_

  no_top_words = 4
  no_top_documents = 2
  output = display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents)

  topics_element.write(output)


Back to the list of PyScript Experiments