Skip to content

Commit

Permalink
v0.9.1 (MaartenGr#211)
Browse files Browse the repository at this point in the history
  • Loading branch information
MaartenGr authored Sep 1, 2021
1 parent 80c9fa1 commit 0b32167
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 22 deletions.
7 changes: 0 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,6 @@ pip install bertopic[spacy]
pip install bertopic[use]
```

To install all backends:

```bash
pip install bertopic[all]
```


## Getting Started
For an in-depth overview of the features of BERTopic
you can check the full documentation [here](https://maartengr.github.io/BERTopic/) or you can follow along
Expand Down
2 changes: 1 addition & 1 deletion bertopic/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from bertopic._bertopic import BERTopic

__version__ = "0.9.0"
__version__ = "0.9.1"

__all__ = [
"BERTopic",
Expand Down
28 changes: 18 additions & 10 deletions bertopic/_bertopic.py
Original file line number Diff line number Diff line change
Expand Up @@ -856,8 +856,9 @@ def reduce_topics(self,
documents = pd.DataFrame({"Document": docs, "Topic": topics})

# Reduce number of topics
self._extract_topics(documents)
documents = self._reduce_topics(documents)
self.merged_topics = None
self._map_representative_docs()

# Extract topics and map probabilities
new_topics = documents.Topic.to_list()
Expand Down Expand Up @@ -960,6 +961,7 @@ def visualize_topics_over_time(self,
topics_over_time: pd.DataFrame,
top_n_topics: int = None,
topics: List[int] = None,
normalize_frequency: bool = False,
width: int = 1250,
height: int = 450) -> go.Figure:
""" Visualize topics over time
Expand All @@ -969,6 +971,7 @@ def visualize_topics_over_time(self,
corresponding topic representation
top_n_topics: To visualize the most frequent topics instead of all
topics: Select which topics you would like to be visualized
normalize_frequency: Whether to normalize each topic's frequency individually
width: The width of the figure.
height: The height of the figure.
Expand Down Expand Up @@ -996,13 +999,15 @@ def visualize_topics_over_time(self,
topics_over_time=topics_over_time,
top_n_topics=top_n_topics,
topics=topics,
normalize_frequency=normalize_frequency,
width=width,
height=height)

def visualize_topics_per_class(self,
topics_per_class: pd.DataFrame,
top_n_topics: int = 10,
topics: List[int] = None,
normalize_frequency: bool = False,
width: int = 1250,
height: int = 900) -> go.Figure:
""" Visualize topics per class
Expand All @@ -1012,6 +1017,7 @@ def visualize_topics_per_class(self,
corresponding topic representation
top_n_topics: To visualize the most frequent topics instead of all
topics: Select which topics you would like to be visualized
normalize_frequency: Whether to normalize each topic's frequency individually
width: The width of the figure.
height: The height of the figure.
Expand Down Expand Up @@ -1039,6 +1045,7 @@ def visualize_topics_per_class(self,
topics_per_class=topics_per_class,
top_n_topics=top_n_topics,
topics=topics,
normalize_frequency=normalize_frequency,
width=width,
height=height)

Expand Down Expand Up @@ -1491,7 +1498,7 @@ def _map_representative_docs(self):
representative_docs = self.representative_docs.copy()

# Remove topics that were merged as the most frequent
# topic or the topics they were merged into contain as they contain
# topic or the topics they were merged into as they contain
# better representative documents
if self.merged_topics:
for topic_to_remove in self.merged_topics:
Expand Down Expand Up @@ -1742,7 +1749,7 @@ def _auto_reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
if self.topic_embeddings is not None:
embeddings = np.array(self.topic_embeddings)
else:
embeddings = self.c_tf_idf
embeddings = self.c_tf_idf.toarray()
norm_data = normalize(embeddings, norm='l2')
predictions = hdbscan.HDBSCAN(min_cluster_size=2,
metric='euclidean',
Expand Down Expand Up @@ -1828,13 +1835,14 @@ def _map_probabilities(self, probabilities: Union[np.ndarray, None]) -> Union[np
mapped_probabilities: Updated probabilities
"""
# Map array of probabilities (probability for assigned topic per document)
if len(probabilities.shape) == 2 and self.get_topic(-1):
mapped_probabilities = np.zeros((probabilities.shape[0],
len(set(self.mapped_topics.values()))-1))
for from_topic, to_topic in self.mapped_topics.items():
if to_topic != -1 and from_topic != -1:
mapped_probabilities[:, to_topic] += probabilities[:, from_topic]
return mapped_probabilities
if probabilities is not None:
if len(probabilities.shape) == 2 and self.get_topic(-1):
mapped_probabilities = np.zeros((probabilities.shape[0],
len(set(self.mapped_topics.values()))-1))
for from_topic, to_topic in self.mapped_topics.items():
if to_topic != -1 and from_topic != -1:
mapped_probabilities[:, to_topic] += probabilities[:, from_topic]
return mapped_probabilities

return probabilities

Expand Down
2 changes: 1 addition & 1 deletion bertopic/plotting/_distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def visualize_distribution(topic_model,
<iframe src="../../tutorial/visualization/probabilities.html"
style="width:1000px; height: 500px; border: 0px;"></iframe>
"""
if len(probabilities.shape) != 2:
if len(probabilities.shape) != 1:
raise ValueError("This visualization cannot be used if you have set `calculate_probabilities` to False "
"as it uses the topic probabilities of all topics. ")
if len(probabilities[probabilities > min_probability]) == 0:
Expand Down
13 changes: 13 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
## **Version 0.9.1**
*Release date: 1 September, 2021*

A release focused on fixing several issues:

**Fixes**:

* Fix TypeError when auto-reducing topics ([#210](https://github.com/MaartenGr/BERTopic/issues/210))
* Fix mapping representative docs when reducing topics ([#208](https://github.com/MaartenGr/BERTopic/issues/208))
* Fix visualization issues with probabilities ([#205](https://github.com/MaartenGr/BERTopic/issues/205))
* Fix missing `normalize_frequency` param in plots ([#213](https://github.com/MaartenGr/BERTopic/issues/213))


## **Version 0.9**
*Release date: 9 August, 2021*

Expand Down
5 changes: 2 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
setup(
name="bertopic",
packages=find_packages(exclude=["notebooks", "docs"]),
version="0.9.0",
version="0.9.1",
author="Maarten P. Grootendorst",
author_email="[email protected]",
description="BERTopic performs topic Modeling with state-of-the-art transformer models.",
Expand Down Expand Up @@ -89,8 +89,7 @@
"flair": flair_packages,
"spacy": spacy_packages,
"use": use_packages,
"gensim": gensim_packages,
"all": extra_packages
"gensim": gensim_packages
},
python_requires='>=3.6',
)

0 comments on commit 0b32167

Please sign in to comment.