Merge pull request #49 from mideind/modernize
Modernization
sveinbjornt committed Aug 23, 2024
2 parents 8750e9c + 7f5c92b commit 340ecb7
Showing 14 changed files with 365 additions and 176 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/python-package.yml
@@ -15,10 +15,11 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "pypy-3.9", "pypy-3.10"]
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.0-rc.1", "pypy-3.9", "pypy-3.10"]

steps:
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
@@ -29,10 +30,10 @@ jobs:
python -m pip install --upgrade pip wheel setuptools
python -m pip install -e ".[dev]"
- name: Type check with mypy (only on Python 3.8)
- name: Type check with mypy (only on oldest supported Python version)
run: |
if [ "${{ matrix.python-version }}" == "3.8" ]; then python -m pip install mypy; fi
if [ "${{ matrix.python-version }}" == "3.8" ]; then mypy --python-version=3.8 src/tokenizer; fi
if [ "${{ matrix.python-version }}" == "3.9" ]; then python -m pip install mypy; fi
if [ "${{ matrix.python-version }}" == "3.9" ]; then mypy --python-version=3.9 src/tokenizer; fi
- name: Test with pytest
run: |
2 changes: 1 addition & 1 deletion LICENSE.txt
@@ -1,6 +1,6 @@
MIT License

Copyright (C) 2023 Miðeind ehf.
Copyright (C) 2016-2024 Miðeind ehf.
Original author: Vilhjálmur Þorsteinsson

Permission is hereby granted, free of charge, to any person obtaining a copy
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -1,3 +1,4 @@
graft src
prune src/tokenizer/__pycache__
prune src/tokenizer/.mypy_cache
prune src/tokenizer/.DS_Store
3 changes: 2 additions & 1 deletion README.rst
@@ -12,7 +12,7 @@ Tokenization is a necessary first step in many natural language processing
tasks, such as word counting, parsing, spell checking, corpus generation, and
statistical analysis of text.

**Tokenizer** is a compact pure-Python (>= 3.8) executable
**Tokenizer** is a compact pure-Python (>=3.9) executable
program and module for tokenizing Icelandic text. It converts input text to
streams of *tokens*, where each token is a separate word, punctuation sign,
number/amount, date, e-mail, URL/URI, etc. It also segments the token stream
@@ -809,6 +809,7 @@ can be found in the file ``test/toktest_normal_gold_expected.txt``.
Changelog
---------

* Version 3.4.5: Compatibility with Python 3.13. Now requires Python 3.9 or later.
* Version 3.4.4: Better handling of abbreviations
* Version 3.4.3: Various minor fixes. Now requires Python 3.8 or later.
* Version 3.4.2: Abbreviations and phrases added, ``META_BEGIN`` token added.
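For context on the README excerpt above, which describes Tokenizer as a pure-Python (>= 3.9) module that turns input text into a stream of tokens: a minimal usage sketch along the lines of the project's documented API follows. The tokenize() generator and the TOK.descr lookup are taken from the package documentation; the sample text and printed fields are illustrative only.

# Minimal usage sketch of the tokenizer module described above.
# Assumes the documented public API: tokenize() yields token tuples with
# kind/txt/val fields, and TOK.descr maps a token kind to a readable name.
from tokenizer import TOK, tokenize

text = "Málfríður keypti 3 kg af eplum 5. maí kl. 14:30."

for token in tokenize(text):
    print(f"{TOK.descr[token.kind]:12} {token.txt or '-'} {token.val or ''}")
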
18 changes: 9 additions & 9 deletions pyproject.toml
@@ -1,33 +1,33 @@
[project]
name = "tokenizer"
version = "3.4.4"
version = "3.4.5"
description = "A tokenizer for Icelandic text"
authors = [{ name = "Miðeind ehf.", email = "[email protected]" }]
readme = { file = "README.rst", content-type = "text/x-rst" }
license = { file = "LICENSE.txt" }
# For classifier list see: https://pypi.org/pypi?%3Aaction=list_classifiers
license = { text = "MIT" }
classifiers = [
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Operating System :: Unix",
"Operating System :: POSIX",
"Operating System :: MacOS",
"Operating System :: Microsoft :: Windows",
"Natural Language :: Icelandic",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Utilities",
"Topic :: Text Processing :: Linguistic",
]
requires-python = ">=3.8"
requires-python = ">=3.9"

[project.urls]
Repository = "https://github.com/mideind/Tokenizer"
@@ -51,17 +51,17 @@ where = ["src"]
[tool.pytest.ini_options]
filterwarnings = [
# Ignore deprecation warnings in libraries, their problem not ours
"ignore::DeprecationWarning",
# "ignore::DeprecationWarning",
]

[tool.ruff]
line-length = 120
line-length = 88

[tool.black]
line-length = 120
line-length = 88

[tool.isort]
# This forces these imports to placed at the top
known_future_library = ["__future__", "typing", "typing_extensions"]
profile = "black"
line_length = 120
line_length = 88
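
The requires-python bump to >= 3.9 above pairs with the typing cleanup later in this commit: from Python 3.9 onward, built-in containers can be subscripted directly in annotations (PEP 585), so the typing.Dict, List and Set imports can be dropped. A small sketch, reusing a name from abbrev.py below:

# Built-in generics (PEP 585) are valid in annotations on Python >= 3.9,
# which is what allows the typing.Dict/List/Set imports to be removed.
from collections import defaultdict

WRONGDOTS: dict[str, list[str]] = defaultdict(list)
WRONGDOTS["osfrv"].append("o.s.frv.")
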
7 changes: 3 additions & 4 deletions src/tokenizer/__init__.py
@@ -1,6 +1,6 @@
"""
Copyright(C) 2022 Miðeind ehf.
Copyright(C) 2016-2024 Miðeind ehf.
Original author: Vilhjálmur Þorsteinsson
This software is licensed under the MIT License:
@@ -63,9 +63,8 @@
from .abbrev import Abbreviations, ConfigError

__author__ = "Miðeind ehf."
__copyright__ = "(C) 2023 Miðeind ehf."
__version__ = importlib.metadata.version("tokenizer")

__copyright__ = "(C) 2016-2024 Miðeind ehf."
__version__ = importlib.metadata.version(__name__)

__all__ = (
"__author__",
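The __init__.py hunk above switches to reading the version from installed package metadata via importlib.metadata.version(__name__); for the top-level tokenizer package, __name__ equals the distribution name. A sketch of the same pattern with a fallback, where the PackageNotFoundError branch is an illustrative addition and not part of this commit:

# Metadata-based version lookup as used above; the fallback branch is
# an assumption for uninstalled source checkouts, not part of the commit.
import importlib.metadata

try:
    __version__ = importlib.metadata.version("tokenizer")
except importlib.metadata.PackageNotFoundError:
    __version__ = "0.0.0.dev0"  # e.g. running from an uninstalled checkout
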
104 changes: 66 additions & 38 deletions src/tokenizer/abbrev.py
@@ -2,7 +2,7 @@
Abbreviations module for tokenization of Icelandic text
Copyright (C) 2022 Miðeind ehf.
Copyright (C) 2016-2024 Miðeind ehf.
Original author: Vilhjálmur Þorsteinsson
This software is licensed under the MIT License:
@@ -33,35 +33,33 @@
"""

from typing import Generic, Iterator, Optional, Set, List, Dict, TypeVar
from typing import Generic, Iterator, Optional, TypeVar

from threading import Lock
from collections import defaultdict, OrderedDict
from importlib.resources import open_text
import importlib.resources as importlib_resources

from .definitions import BIN_Tuple


class ConfigError(Exception):

pass


_T = TypeVar("_T")


class OrderedSet(Generic[_T]):

""" Shim class to provide an ordered set API on top
of an OrderedDict. This is necessary to make abbreviation
lookups predictable and repeatable, which they would not be
if a standard Python set() was used. """
"""Shim class to provide an ordered set API on top
of an OrderedDict. This is necessary to make abbreviation
lookups predictable and repeatable, which they would not be
if a standard Python set() was used."""

def __init__(self) -> None:
self._dict: Dict[_T, None] = OrderedDict()
self._dict: dict[_T, None] = OrderedDict()

def add(self, item: _T) -> None:
""" Add an item at the end of the ordered set """
"""Add an item at the end of the ordered set"""
if item not in self._dict:
self._dict[item] = None
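
The OrderedSet shim above keeps abbreviation lookups deterministic: iteration follows insertion order, so ambiguous abbreviations always resolve the same way between runs. A small illustration, assuming OrderedSet is importable from tokenizer.abbrev:

# Duplicates are ignored and iteration follows insertion order,
# which keeps abbreviation lookups repeatable across runs.
from tokenizer.abbrev import OrderedSet

s: OrderedSet[str] = OrderedSet()
for meaning in ("til dæmis", "þar á meðal", "til dæmis"):
    s.add(meaning)
print(list(s))  # ['til dæmis', 'þar á meðal'] - insertion order, no duplicates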

@@ -73,42 +71,41 @@ def __iter__(self) -> Iterator[_T]:


class Abbreviations:

""" Wrapper around dictionary of abbreviations,
initialized from the config file """
"""Wrapper around dictionary of abbreviations,
initialized from the config file"""

# Dictionary of abbreviations and their meanings
DICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
DICT: dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
# Wrong versions of abbreviations
WRONGDICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
WRONGDICT: dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
# All abbreviation meanings
MEANINGS: Set[str] = set()
MEANINGS: set[str] = set()
# Single-word abbreviations, i.e. those with only one dot at the end
SINGLES: Set[str] = set()
SINGLES: set[str] = set()
# Set of abbreviations without periods, e.g. "td", "osfrv"
WRONGSINGLES: Set[str] = set()
WRONGSINGLES: set[str] = set()
# Potential sentence finishers, i.e. those with a dot at the end,
# marked with an asterisk in the config file
FINISHERS: Set[str] = set()
FINISHERS: set[str] = set()
# Abbreviations that should not be seen as such at the end of sentences,
# marked with an exclamation mark in the config file
NOT_FINISHERS: Set[str] = set()
NOT_FINISHERS: set[str] = set()
# Abbreviations that should not be seen as such at the end of sentences, but
# are allowed in front of person names; marked with a hat ^ in the config file
NAME_FINISHERS: Set[str] = set()
NAME_FINISHERS: set[str] = set()
# Wrong versions of abbreviations with possible corrections
# wrong version : [correction1, correction2, ...]
WRONGDOTS: Dict[str, List[str]] = defaultdict(list)
WRONGDOTS: dict[str, list[str]] = defaultdict(list)
# Word forms that should never be interpreted as abbreviations
NOT_ABBREVIATIONS: Set[str] = set()
NOT_ABBREVIATIONS: set[str] = set()

# Ensure that only one thread initializes the abbreviations
_lock = Lock()

@staticmethod
def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
""" Add an abbreviation to the dictionary.
Called from the config file handler. """
"""Add an abbreviation to the dictionary.
Called from the config file handler."""
# Check for sentence finishers
finisher = False
not_finisher = False
@@ -152,7 +149,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
# Append the abbreviation and its meaning in tuple form
# Multiple meanings are supported for each abbreviation
Abbreviations.DICT[abbrev].add(
BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, abbrev, "-",)
BIN_Tuple(
meaning,
0,
gender,
"skst" if fl is None else fl,
abbrev,
"-",
)
)
Abbreviations.MEANINGS.add(meaning)
# Adding wrong versions of abbreviations
@@ -169,7 +173,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
# as abbreviations, even though they are listed as such
# in the form 'Í.' and 'Á.' for use within person names
Abbreviations.WRONGDICT[wabbrev].add(
BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
BIN_Tuple(
meaning,
0,
gender,
"skst" if fl is None else fl,
wabbrev,
"-",
)
)

elif "." in abbrev:
@@ -182,15 +193,22 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
wabbrev = abbrev[:i] + abbrev[i + 1 :]
Abbreviations.WRONGDOTS[wabbrev].append(abbrev)
Abbreviations.WRONGDICT[wabbrev].add(
BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
BIN_Tuple(
meaning,
0,
gender,
"skst" if fl is None else fl,
wabbrev,
"-",
)
)
if len(indices) > 2:
# 3 or 4 dots currently in vocabulary
# Not all cases with 4 dots are handled.
i1 = indices[0]
i2 = indices[1]
i3 = indices[2]
wabbrevs: List[str] = []
wabbrevs: list[str] = []
# 1 and 2 removed
wabbrevs.append(abbrev[:i1] + abbrev[i1 + 1 : i2] + abbrev[i2 + 1 :])
# 1 and 3 removed
@@ -214,7 +232,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
Abbreviations.WRONGSINGLES.add(wabbrev)
Abbreviations.WRONGDOTS[wabbrev].append(abbrev)
Abbreviations.WRONGDICT[wabbrev].add(
BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
BIN_Tuple(
meaning,
0,
gender,
"skst" if fl is None else fl,
wabbrev,
"-",
)
)
if finisher:
Abbreviations.FINISHERS.add(abbrev)
@@ -232,16 +257,16 @@ def has_abbreviation(meaning: str) -> bool:
return meaning in Abbreviations.MEANINGS

@staticmethod
def get_meaning(abbrev: str) -> Optional[List[BIN_Tuple]]:
""" Lookup meaning(s) of abbreviation, if available. """
def get_meaning(abbrev: str) -> Optional[list[BIN_Tuple]]:
"""Look up meaning(s) of abbreviation, if available."""
m = Abbreviations.DICT.get(abbrev)
if not m:
m = Abbreviations.WRONGDICT.get(abbrev)
return list(m) if m else None

@staticmethod
def _handle_abbreviations(s: str) -> None:
""" Handle abbreviations in the settings section """
"""Handle abbreviations in the settings section"""
# Format: abbrev[*] = "meaning" gender (kk|kvk|hk)
# An asterisk after an abbreviation ending with a period
# indicates that the abbreviation may finish a sentence
@@ -272,22 +297,25 @@

@staticmethod
def _handle_not_abbreviations(s: str) -> None:
""" Handle not_abbreviations in the settings section """
"""Handle not_abbreviations in the settings section"""
if len(s) < 3 or s[0] != '"' or s[-1] != '"':
raise ConfigError("not_abbreviations should be enclosed in double quotes")
Abbreviations.NOT_ABBREVIATIONS.add(s[1:-1])

@staticmethod
def initialize():
""" Read the abbreviations config file """
"""Read the abbreviations config file"""
with Abbreviations._lock:
if len(Abbreviations.DICT):
# Already initialized
return

section = None
config = open_text(package="tokenizer", resource="Abbrev.conf", encoding="utf-8")
for s in config:

p = importlib_resources.files("tokenizer").joinpath("Abbrev.conf")
config = p.read_text(encoding="utf-8")

for s in config.split("\n"):
# Ignore comments
ix = s.find("#")
if ix >= 0:
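The final hunk above replaces the deprecated importlib.resources.open_text() call with the files()/read_text() API and iterates over the lines itself. A standalone sketch of the new idiom, assuming a package named tokenizer that ships Abbrev.conf as package data:

# Reading bundled package data with the modern importlib.resources API,
# mirroring Abbreviations.initialize() above.
import importlib.resources as importlib_resources

p = importlib_resources.files("tokenizer").joinpath("Abbrev.conf")
config = p.read_text(encoding="utf-8")
for s in config.split("\n"):
    if s.strip() and not s.lstrip().startswith("#"):
        ...  # handle a non-comment config line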