x-tabdeveloping · x-tabdeveloping · Jan 3, 2025 · Jan 3, 2025 · Jan 3, 2025 · Jan 3, 2025
diff --git a/README.md b/README.md
@@ -19,6 +19,14 @@ Pretty and opinionated topic model visualization in Python.
 
 https://github.com/x-tabdeveloping/topicwizard/assets/13087737/9736f33c-6865-4ed4-bc17-d8e6369bda80
 
+## New in version 1.1.3
+
+You can now specify your own font that should be used for wordclouds.
+This makes topicwizard usable with Chinese and other non-Indo-European scripts.
+
+```python
+topicwizard.visualize(topic_data=topic_data, wordcloud_font_path="NotoSansTC-Bold.ttf")
+```
 
 ## New in version 1.1.0 🌟
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "topic-wizard"
-version = "1.1.2"
+version = "1.1.3"
 description = "Pretty and opinionated topic model visualization in Python."
 authors = ["Márton Kardos <power.up1163@gmail.com>"]
 license = "MIT"

diff --git a/topicwizard/app.py b/topicwizard/app.py
@@ -33,6 +33,7 @@ def get_dash_app(
     exclude_pages: Optional[Set[PageName]] = None,
     document_names: Optional[List[str]] = None,
     group_labels: Optional[List[str]] = None,
+    wordcloud_font_path: Optional[str] = None,
 ) -> Dash:
     """Returns topicwizard Dash application.
 
@@ -50,6 +51,9 @@ def get_dash_app(
         You can pass it along if you have genre labels for example.
         In this case an additional page will get created with information
         about how these groups relate to topics and words in the corpus.
+    wordcloud_font_path: str, default None
+        Font to use for generating wordclouds.
+        Open Sans is used by default.
 
     Returns
     -------
@@ -64,6 +68,7 @@ def get_dash_app(
         or [f"Document {i}" for i, _ in enumerate(topic_data["corpus"])],
         group_labels=group_labels,
         exclude_pages=exclude_pages,
+        wordcloud_font_path=wordcloud_font_path,
     )
     app = Dash(
         __name__,
@@ -82,7 +87,9 @@ def get_dash_app(
     return app
 
 
-def load_app(filename: str, exclude_pages: Optional[Iterable[PageName]] = None) -> Dash:
+def load_app(
+    filename: str, exclude_pages: Optional[Iterable[PageName]] = None, **kwargs
+) -> Dash:
     """Loads and prepares saved app from disk.
 
     Parameters
@@ -100,7 +107,7 @@ def load_app(filename: str, exclude_pages: Optional[Iterable[PageName]] = None)
         exclude_pages = set()
     else:
         exclude_pages = set(exclude_pages)
-    return get_dash_app(**data, exclude_pages=exclude_pages)
+    return get_dash_app(**data, exclude_pages=exclude_pages, **kwargs)
 
 
 def open_url(url: str) -> None:
@@ -156,6 +163,7 @@ def load(
     filename: str,
     exclude_pages: Optional[Iterable[PageName]] = None,
     port: int = 8050,
+    **kwargs,
 ) -> Optional[threading.Thread]:
     """Visualizes topic model data loaded from disk.
 
@@ -179,7 +187,7 @@ def load(
     """
     print("Preparing data")
     exclude_pages = set() if exclude_pages is None else set(exclude_pages)
-    app = load_app(filename, exclude_pages=exclude_pages)
+    app = load_app(filename, exclude_pages=exclude_pages, **kwargs)
     return run_app(app, port=port)
 
 
@@ -211,6 +219,7 @@ def visualize(
     exclude_pages: Optional[Iterable[PageName]] = None,
     group_labels: Optional[List[str]] = None,
     port: int = 8050,
+    wordcloud_font_path: Optional[str] = None,
     **kwargs,
 ) -> Optional[threading.Thread]:
     """Visualizes your topic model with topicwizard.
@@ -238,7 +247,9 @@ def visualize(
         You can pass it along if you have genre labels for example.
         In this case an additional page will get created with information
         about how these groups relate to topics and words in the corpus.
-
+    wordcloud_font_path: str, default None
+        Font to use for generating wordclouds.
+        Open Sans is used by default.
 
     Returns
     -------
@@ -278,5 +289,6 @@ def visualize(
         document_names=document_names,
         exclude_pages=exclude_pages,
         group_labels=group_labels,
+        wordcloud_font_path=wordcloud_font_path,
     )
     return run_app(app, port=port)
diff --git a/topicwizard/blueprints/app.py b/topicwizard/blueprints/app.py
@@ -31,6 +31,7 @@ def create_blueprint(
     word_positions: Optional[np.ndarray] = None,
     topic_positions: Optional[np.ndarray] = None,
     document_positions: Optional[np.ndarray] = None,
+    wordcloud_font_path: Optional[str] = None,
 ) -> DashBlueprint:
     # --------[ Collecting blueprints ]--------
     topic_blueprint = (
@@ -43,6 +44,7 @@ def create_blueprint(
             corpus=corpus,
             topic_names=topic_names,
             topic_positions=topic_positions,
+            wordcloud_font_path=wordcloud_font_path,
         )
         if "topics" not in exclude_pages
         else create_blank_page("topics")
@@ -59,6 +61,7 @@ def create_blueprint(
             corpus=corpus,
             topic_names=topic_names,
             document_positions=document_positions,
+            wordcloud_font_path=wordcloud_font_path,
         )
         if "documents" not in exclude_pages
         else create_blank_page("documents")
@@ -87,6 +90,7 @@ def create_blueprint(
             corpus=corpus,
             topic_names=topic_names,
             group_labels=group_labels,
+            wordcloud_font_path=wordcloud_font_path,
         )
         if group_labels is not None
         else create_blank_page("groups")

diff --git a/topicwizard/blueprints/groups.py b/topicwizard/blueprints/groups.py
@@ -1,4 +1,4 @@
-from typing import Any, List
+from typing import Any, List, Optional
 
 import dash_mantine_components as dmc
 import numpy as np
@@ -10,7 +10,8 @@
 import topicwizard.prepare.groups as prepare
 from topicwizard.components.groups.group_barplot import create_group_barplot
 from topicwizard.components.groups.group_map import create_group_map
-from topicwizard.components.groups.group_wordcloud import create_group_wordcloud
+from topicwizard.components.groups.group_wordcloud import \
+    create_group_wordcloud
 from topicwizard.help.utils import make_helper
 
 
@@ -20,6 +21,7 @@ def create_blueprint(
     document_topic_matrix: np.ndarray,
     topic_term_matrix: np.ndarray,
     group_labels: List[str],
+    wordcloud_font_path: Optional[str] = None,
     **kwargs,
 ) -> DashBlueprint:
     # --------[ Preparing data ]--------
@@ -47,7 +49,9 @@ def create_blueprint(
     group_map = create_group_map(
         group_positions, group_importances, group_names, dominant_topics, topic_colors
     )
-    group_wordcloud = create_group_wordcloud(group_term_importances, vocab)
+    group_wordcloud = create_group_wordcloud(
+        group_term_importances, vocab, wordcloud_font_path=wordcloud_font_path
+    )
     group_barchart = create_group_barplot(group_topic_importances, topic_colors)
     blueprints = [
         group_map,

diff --git a/topicwizard/blueprints/topics.py b/topicwizard/blueprints/topics.py
@@ -50,6 +50,7 @@ def create_blueprint(
     topic_term_matrix: np.ndarray,
     topic_names: List[str],
     topic_positions: Optional[np.ndarray] = None,
+    wordcloud_font_path: Optional[str] = None,
     **kwargs,
 ) -> DashBlueprint:
     # --------[ Preparing data ]--------
@@ -61,7 +62,9 @@ def create_blueprint(
         topic_positions, topic_importances, topic_names
     )
     topic_barplot = create_topic_barplot(topic_term_matrix, vocab)
-    wordcloud = create_wordcloud(topic_term_matrix, vocab)
+    wordcloud = create_wordcloud(
+        topic_term_matrix, vocab, wordcloud_font_path=wordcloud_font_path
+    )
     blueprints = [
         intertopic_map,
         topic_switcher,

diff --git a/topicwizard/components/groups/group_wordcloud.py b/topicwizard/components/groups/group_wordcloud.py
@@ -9,7 +9,7 @@
 
 
 def create_group_wordcloud(
-    group_term_importances: np.ndarray, vocab: np.ndarray
+    group_term_importances: np.ndarray, vocab: np.ndarray, wordcloud_font_path=None
 ) -> DashBlueprint:
     group_wordcloud = DashBlueprint()
 
@@ -25,6 +25,6 @@ def create_group_wordcloud(
     )
     def update_plot(selected_group: int) -> go.Figure:
         top_words = prepare.top_words(selected_group, 60, group_term_importances, vocab)
-        return plots.wordcloud(top_words)
+        return plots.wordcloud(top_words, custom_font_path=wordcloud_font_path)
 
     return group_wordcloud
diff --git a/topicwizard/components/topics/wordcloud.py b/topicwizard/components/topics/wordcloud.py
@@ -7,7 +7,7 @@
 import topicwizard.prepare.topics as prepare
 
 
-def create_wordcloud(topic_term_matrix, vocab):
+def create_wordcloud(topic_term_matrix, vocab, wordcloud_font_path=None):
     wordcloud = DashBlueprint()
     top_bar = prepare.calculate_top_words(
         topic_id=0,
@@ -32,6 +32,6 @@ def update(current_topic: int) -> go.Figure:
             components=topic_term_matrix,
             vocab=vocab,
         )
-        return plots.wordcloud(top_bar)
+        return plots.wordcloud(top_bar, custom_font_path=wordcloud_font_path)
 
     return wordcloud
diff --git a/topicwizard/figures/groups.py b/topicwizard/figures/groups.py
@@ -163,7 +163,12 @@ def group_topic_barcharts(
 
 
 def group_wordclouds(
-    topic_data: TopicData, group_labels: List[str], top_n: int = 30, n_columns: int = 4
+    topic_data: TopicData,
+    group_labels: List[str],
+    top_n: int = 30,
+    n_columns: int = 4,
+    custom_font_path: str = None,
+    color_scheme: str = "twilight",
 ) -> go.Figure:
     """Plots wordclouds for each group.
 
@@ -177,6 +182,10 @@ def group_wordclouds(
         Number of words to display for each group.
     n_columns: int, default 4
         Number of columns the faceted plot should have.
+    custom_font_path: str, default None
+        Path to custom font to use to render the wordcloud.
+    color_scheme: str, default 'twilight'
+        Matplotlib color scheme to use for the plot.
     """
     # Factorizing group labels
     group_id_labels, group_names = pd.factorize(group_labels)
@@ -203,7 +212,9 @@ def group_wordclouds(
         top_words = prepare.top_words(
             group_id, top_n, group_term_importances, topic_data["vocab"]
         )
-        subfig = plots.wordcloud(top_words)
+        subfig = plots.wordcloud(
+            top_words, color_scheme=color_scheme, custom_font_path=custom_font_path
+        )
         row, column = (group_id // n_columns) + 1, (group_id % n_columns) + 1
         fig.add_trace(subfig.data[0], row=row, col=column)
     fig.update_layout(

diff --git a/topicwizard/figures/topics.py b/topicwizard/figures/topics.py
@@ -111,6 +111,7 @@ def topic_wordclouds(
     top_n: int = 30,
     n_columns: int = 4,
     color_scheme: str = "copper",
+    custom_font_path=None,
 ) -> go.Figure:
     """Plots most relevant words as word clouds for every topic.
 
@@ -124,6 +125,8 @@ def topic_wordclouds(
         Number of columns in the subplot grid.
     color_scheme: str, default 'copper'
         Matplotlib color scheme to use for the wordcloud.
+    custom_font_path: str, default None
+        Path to custom font to use to render the wordcloud.
     """
     n_topics = topic_data["topic_term_matrix"].shape[0]
     (
@@ -150,7 +153,9 @@ def topic_wordclouds(
             components=topic_term_importances,
             vocab=topic_data["vocab"],
         )
-        subfig = plots.wordcloud(top_words, color_scheme=color_scheme)
+        subfig = plots.wordcloud(
+            top_words, color_scheme=color_scheme, custom_font_path=custom_font_path
+        )
         row, column = (topic_id // n_columns) + 1, (topic_id % n_columns) + 1
         fig.add_trace(subfig.data[0], row=row, col=column)
     fig.update_layout(

diff --git a/topicwizard/plots/documents.py b/topicwizard/plots/documents.py
@@ -1,4 +1,5 @@
 """Module containing plotting utilities for documents."""
+
 from typing import Dict, Iterable, List, Optional
 
 import numpy as np
@@ -9,6 +10,8 @@
 from PIL import Image
 from wordcloud import WordCloud
 
+from topicwizard.plots.utils import get_default_font_path
+
 
 def document_map(
     x: np.ndarray,
@@ -209,11 +212,16 @@ def document_timeline(
 
 
 def document_wordcloud(
-    doc_id: int, document_term_matrix: np.ndarray, vocab: np.ndarray
+    doc_id: int,
+    document_term_matrix: np.ndarray,
+    vocab: np.ndarray,
+    custom_font_path=None,
 ) -> go.Figure:
     coo = spr.coo_array(document_term_matrix[[doc_id], :])
     term_dict = {vocab[column]: data for column, data in zip(coo.col, coo.data)}
+    font_path = custom_font_path or get_default_font_path().absolute()
     cloud = WordCloud(
+        font_path=font_path,
         width=800,
         height=800,
         background_color="white",

diff --git a/topicwizard/plots/groups.py b/topicwizard/plots/groups.py
@@ -5,6 +5,8 @@
 from PIL import Image
 from wordcloud import WordCloud
 
+from topicwizard.plots.utils import get_default_font_path
+
 
 def group_map(
     x: np.ndarray,
@@ -133,17 +135,21 @@ def group_topics_barchart(top_topics: pd.DataFrame, topic_colors: np.ndarray):
     return fig
 
 
-def wordcloud(top_words: pd.DataFrame) -> go.Figure:
+def wordcloud(
+    top_words: pd.DataFrame, custom_font_path=None, color_scheme: str = "twilight"
+) -> go.Figure:
     """Plots most relevant words for current topic as a worcloud."""
     top_dict = {
         word: importance
         for word, importance in zip(top_words.word, top_words.importance)
     }
+    font_path = custom_font_path or get_default_font_path().absolute()
     cloud = WordCloud(
+        font_path=font_path,
         width=800,
         height=1060,
         background_color="white",
-        colormap="twilight",
+        colormap=color_scheme,
         scale=4,
     ).generate_from_frequencies(top_dict)
     image = cloud.to_image()

diff --git a/topicwizard/plots/topics.py b/topicwizard/plots/topics.py
@@ -10,6 +10,8 @@
 from sklearn.preprocessing import minmax_scale
 from wordcloud import WordCloud
 
+from topicwizard.plots.utils import get_default_font_path
+
 
 def intertopic_map(
     x: np.ndarray,
@@ -140,15 +142,19 @@ def topic_plot(top_words: pd.DataFrame):
     return fig
 
 
-def wordcloud(top_words: pd.DataFrame, color_scheme: str = "copper") -> go.Figure:
+def wordcloud(
+    top_words: pd.DataFrame, color_scheme: str = "copper", custom_font_path=None
+) -> go.Figure:
     """Plots most relevant words for current topic as a worcloud."""
     top_dict = {
         word: importance
         for word, importance in zip(
             top_words.word, 0.1 + minmax_scale(top_words.importance)
         )
     }
+    font_path = custom_font_path or get_default_font_path().absolute()
     cloud = WordCloud(
+        font_path=font_path,
         width=800,
         height=1060,
         background_color="white",