Refactor the slugify function

ozhyrenkov · May 3, 2021 · 00ab0e1 · 00ab0e1
1 parent 74dd46d
commit 00ab0e1
Show file tree

Hide file tree

Showing 4 changed files with 67 additions and 17 deletions.
diff --git a/normality/__init__.py b/normality/__init__.py
@@ -14,6 +14,7 @@
 from normality.encoding import DEFAULT_ENCODING
 from normality.stringify import stringify
 from normality.paths import safe_filename
+from normality.slugify import slugify
 from normality.util import Categories, Encoding
 
 __all__ = [
@@ -78,24 +79,10 @@ def normalize(
     # Perform unicode category-based character replacement. This is
     # used to filter out whole classes of characters, such as symbols,
     # punctuation, or whitespace-like characters.
-    text = category_replace(text, replace_categories)
-
-    if text is None:
-        return None
+    if replace_categories is not None:
+        text = category_replace(text, replace_categories)
 
     if collapse:
         # Remove consecutive whitespace.
         text = collapse_spaces(text)
     return text
-
-
-def slugify(value: Any, sep: str = "-") -> Optional[str]:
-    """A simple slug generator."""
-    text = stringify(value)
-    if text is None:
-        return None
-    text = text.replace(sep, WS)
-    text = normalize(text, ascii=True)
-    if text is None:
-        return None
-    return text.replace(WS, sep)
diff --git a/normality/constants.py b/normality/constants.py
@@ -30,7 +30,7 @@
     "Zs": WS,
     "Zl": WS,
     "Zp": WS,
-    "Pc": None,
+    "Pc": WS,  # TODO: figure out if this wants to be None
     "Pd": WS,
     "Ps": WS,
     "Pe": WS,
@@ -46,4 +46,35 @@
     "Zp": WS,
 }
 
+SLUG_CATEGORIES: Categories = {
+    "Cc": None,
+    "Cf": None,
+    "Cs": None,
+    "Co": None,
+    "Cn": None,
+    # "Lm": None,
+    # "Mn": None,
+    "Mc": WS,
+    "Me": None,
+    "No": None,
+    "Zs": WS,
+    "Zl": WS,
+    "Zp": WS,
+    "Pc": WS,
+    "Pd": WS,
+    "Ps": WS,
+    "Pe": WS,
+    "Pi": WS,
+    "Pf": WS,
+    "Po": WS,
+    "Sm": WS,
+    "Sc": None,
+    "Sk": None,
+    "So": WS,
+    "Zs": WS,
+    "Zl": WS,
+    "Zp": WS,
+}
+
+
 CONTROL_CODES: Categories = {"Cc": WS, "Cf": WS, "Cs": WS, "Co": WS, "Cn": WS}
diff --git a/normality/slugify.py b/normality/slugify.py
@@ -0,0 +1,29 @@
+import string
+from typing import Any, Optional
+
+from normality.cleaning import collapse_spaces, category_replace
+from normality.constants import SLUG_CATEGORIES, WS
+from normality.transliteration import latinize_text
+from normality.stringify import stringify
+
+VALID_CHARS = string.ascii_lowercase + string.digits + WS
+
+
+def slugify(value: Any, sep: str = "-") -> Optional[str]:
+    """A simple slug generator. Slugs are pure ASCII lowercase strings
+    that can be used in URLs and other places where a name has to be
+    machine-safe."""
+    text = stringify(value)
+    if text is None:
+        return None
+    text = text.replace(sep, WS)
+    # run this first because it'll give better results on special
+    # characters.
+    text = category_replace(text, SLUG_CATEGORIES)
+    text = latinize_text(text, ascii=True)
+    text = text.lower()
+    text = "".join([c for c in text if c in VALID_CHARS])
+    text = collapse_spaces(text)
+    if text is None:
+        return None
+    return text.replace(WS, sep)
diff --git a/tests/test_normality.py b/tests/test_normality.py
@@ -32,6 +32,8 @@ def test_azeri(self):
     def test_slugify(self):
         text = u"BABY! camel-is good"
         self.assertEqual("baby-camel-is-good", slugify(text, sep="-"))
+        self.assertEqual("tests", slugify("testʼs", sep="-"))
+        self.assertEqual("test-s", slugify("test_s", sep="-"))
 
     def test_georgian(self):
         text = u"ავლაბრის ფონდი"
@@ -40,6 +42,7 @@ def test_georgian(self):
     def test_german(self):
         text = u"Häschen Spaß"
         self.assertEqual("Haschen Spass", ascii_text(text))
+        self.assertEqual("haschen-spass", slugify(text, sep="-"))
 
     def test_stringify(self):
         self.assertEqual(".", stringify(" . "))