Merge pull request #49 from mideind/modernize
Modernization
sveinbjornt committed Aug 23, 2024
2 parents 8750e9c + 7f5c92b commit 340ecb7
Showing 14 changed files with 365 additions and 176 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/python-package.yml
@@ -15,10 +15,11 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest]
python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12", "pypy-3.9", "pypy-3.10"]
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13.0-rc.1", "pypy-3.9", "pypy-3.10"]

steps:
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
@@ -29,10 +30,10 @@ jobs:
python -m pip install --upgrade pip wheel setuptools
python -m pip install -e ".[dev]"
- name: Type check with mypy (only on Python 3.8)
- name: Type check with mypy (only on oldest supported Python version)
run: |
if [ "${{ matrix.python-version }}" == "3.8" ]; then python -m pip install mypy; fi
if [ "${{ matrix.python-version }}" == "3.8" ]; then mypy --python-version=3.8 src/tokenizer; fi
if [ "${{ matrix.python-version }}" == "3.9" ]; then python -m pip install mypy; fi
if [ "${{ matrix.python-version }}" == "3.9" ]; then mypy --python-version=3.9 src/tokenizer; fi
- name: Test with pytest
run: |
2 changes: 1 addition & 1 deletion LICENSE.txt
@@ -1,6 +1,6 @@
MIT License

Copyright (C) 2023 Miðeind ehf.
Copyright (C) 2016-2024 Miðeind ehf.
Original author: Vilhjálmur Þorsteinsson

Permission is hereby granted, free of charge, to any person obtaining a copy
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -1,3 +1,4 @@
graft src
prune src/tokenizer/__pycache__
prune src/tokenizer/.mypy_cache
prune src/tokenizer/.DS_Store
3 changes: 2 additions & 1 deletion README.rst
@@ -12,7 +12,7 @@ Tokenization is a necessary first step in many natural language processing
tasks, such as word counting, parsing, spell checking, corpus generation, and
statistical analysis of text.

**Tokenizer** is a compact pure-Python (>= 3.8) executable
**Tokenizer** is a compact pure-Python (>=3.9) executable
program and module for tokenizing Icelandic text. It converts input text to
streams of *tokens*, where each token is a separate word, punctuation sign,
number/amount, date, e-mail, URL/URI, etc. It also segments the token stream
@@ -809,6 +809,7 @@ can be found in the file ``test/toktest_normal_gold_expected.txt``.
Changelog
---------

* Version 3.4.5: Compatibility with Python 3.13. Now requires Python 3.9 or later.
* Version 3.4.4: Better handling of abbreviations
* Version 3.4.3: Various minor fixes. Now requires Python 3.8 or later.
* Version 3.4.2: Abbreviations and phrases added, ``META_BEGIN`` token added.
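For context on the README excerpt above, which describes Tokenizer as a pure-Python (>= 3.9) module that turns input text into a stream of tokens: a minimal usage sketch along the lines of the project's documented API follows. The tokenize() generator and the TOK.descr lookup are taken from the package documentation; the sample text and printed fields are illustrative only.

# Minimal usage sketch of the tokenizer module described above.
# Assumes the documented public API: tokenize() yields token tuples with
# kind/txt/val fields, and TOK.descr maps a token kind to a readable name.
from tokenizer import TOK, tokenize

text = "Málfríður keypti 3 kg af eplum 5. maí kl. 14:30."

for token in tokenize(text):
    print(f"{TOK.descr[token.kind]:12} {token.txt or '-'} {token.val or ''}")
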
18 changes: 9 additions & 9 deletions pyproject.toml
@@ -1,33 +1,33 @@
[project]
name = "tokenizer"
version = "3.4.4"
version = "3.4.5"
description = "A tokenizer for Icelandic text"
authors = [{ name = "Miðeind ehf.", email = "[email protected]" }]
readme = { file = "README.rst", content-type = "text/x-rst" }
license = { file = "LICENSE.txt" }
# For classifier list see: https://pypi.org/pypi?%3Aaction=list_classifiers
license = { text = "MIT" }
classifiers = [
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Operating System :: Unix",
"Operating System :: POSIX",
"Operating System :: MacOS",
"Operating System :: Microsoft :: Windows",
"Natural Language :: Icelandic",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Utilities",
"Topic :: Text Processing :: Linguistic",
]
requires-python = ">=3.8"
requires-python = ">=3.9"

[project.urls]
Repository = "https://github.com/mideind/Tokenizer"
@@ -51,17 +51,17 @@ where = ["src"]
[tool.pytest.ini_options]
filterwarnings = [
# Ignore deprecation warnings in libraries, their problem not ours
"ignore::DeprecationWarning",
# "ignore::DeprecationWarning",
]

[tool.ruff]
line-length = 120
line-length = 88

[tool.black]
line-length = 120
line-length = 88

[tool.isort]
# This forces these imports to placed at the top
known_future_library = ["__future__", "typing", "typing_extensions"]
profile = "black"
line_length = 120
line_length = 88
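
The requires-python bump to >= 3.9 above pairs with the typing cleanup later in this commit: from Python 3.9 onward, built-in containers can be subscripted directly in annotations (PEP 585), so the typing.Dict, List and Set imports can be dropped. A small sketch, reusing a name from abbrev.py below:

# Built-in generics (PEP 585) are valid in annotations on Python >= 3.9,
# which is what allows the typing.Dict/List/Set imports to be removed.
from collections import defaultdict

WRONGDOTS: dict[str, list[str]] = defaultdict(list)
WRONGDOTS["osfrv"].append("o.s.frv.")
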
7 changes: 3 additions & 4 deletions src/tokenizer/__init__.py
@@ -1,6 +1,6 @@
"""
Copyright(C) 2022 Miðeind ehf.
Copyright(C) 2016-2024 Miðeind ehf.
Original author: Vilhjálmur Þorsteinsson
This software is licensed under the MIT License:
@@ -63,9 +63,8 @@
from .abbrev import Abbreviations, ConfigError

__author__ = "Miðeind ehf."
__copyright__ = "(C) 2023 Miðeind ehf."
__version__ = importlib.metadata.version("tokenizer")

__copyright__ = "(C) 2016-2024 Miðeind ehf."
__version__ = importlib.metadata.version(__name__)

__all__ = (
"__author__",
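The __init__.py hunk above switches to reading the version from installed package metadata via importlib.metadata.version(__name__); for the top-level tokenizer package, __name__ equals the distribution name. A sketch of the same pattern with a fallback, where the PackageNotFoundError branch is an illustrative addition and not part of this commit:

# Metadata-based version lookup as used above; the fallback branch is
# an assumption for uninstalled source checkouts, not part of the commit.
import importlib.metadata

try:
    __version__ = importlib.metadata.version("tokenizer")
except importlib.metadata.PackageNotFoundError:
    __version__ = "0.0.0.dev0"  # e.g. running from an uninstalled checkout
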
104 changes: 66 additions & 38 deletions src/tokenizer/abbrev.py
@@ -2,7 +2,7 @@
Abbreviations module for tokenization of Icelandic text
Copyright (C) 2022 Miðeind ehf.
Copyright (C) 2016-2024 Miðeind ehf.
Original author: Vilhjálmur Þorsteinsson
This software is licensed under the MIT License:
@@ -33,35 +33,33 @@
"""

from typing import Generic, Iterator, Optional, Set, List, Dict, TypeVar
from typing import Generic, Iterator, Optional, TypeVar

from threading import Lock
from collections import defaultdict, OrderedDict
from importlib.resources import open_text
import importlib.resources as importlib_resources

from .definitions import BIN_Tuple


class ConfigError(Exception):

pass


_T = TypeVar("_T")


class OrderedSet(Generic[_T]):

""" Shim class to provide an ordered set API on top
of an OrderedDict. This is necessary to make abbreviation
lookups predictable and repeatable, which they would not be
if a standard Python set() was used. """
"""Shim class to provide an ordered set API on top
of an OrderedDict. This is necessary to make abbreviation
lookups predictable and repeatable, which they would not be
if a standard Python set() was used."""

def __init__(self) -> None:
self._dict: Dict[_T, None] = OrderedDict()
self._dict: dict[_T, None] = OrderedDict()

def add(self, item: _T) -> None:
""" Add an item at the end of the ordered set """
"""Add an item at the end of the ordered set"""
if item not in self._dict:
self._dict[item] = None
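
The OrderedSet shim above keeps abbreviation lookups deterministic: iteration follows insertion order, so ambiguous abbreviations always resolve the same way between runs. A small illustration, assuming OrderedSet is importable from tokenizer.abbrev:

# Duplicates are ignored and iteration follows insertion order,
# which keeps abbreviation lookups repeatable across runs.
from tokenizer.abbrev import OrderedSet

s: OrderedSet[str] = OrderedSet()
for meaning in ("til dæmis", "þar á meðal", "til dæmis"):
    s.add(meaning)
print(list(s))  # ['til dæmis', 'þar á meðal'] - insertion order, no duplicates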

@@ -73,42 +71,41 @@ def __iter__(self) -> Iterator[_T]:


class Abbreviations:

""" Wrapper around dictionary of abbreviations,
initialized from the config file """
"""Wrapper around dictionary of abbreviations,
initialized from the config file"""

# Dictionary of abbreviations and their meanings
DICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
DICT: dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
# Wrong versions of abbreviations
WRONGDICT: Dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
WRONGDICT: dict[str, OrderedSet[BIN_Tuple]] = defaultdict(OrderedSet)
# All abbreviation meanings
MEANINGS: Set[str] = set()
MEANINGS: set[str] = set()
# Single-word abbreviations, i.e. those with only one dot at the end
SINGLES: Set[str] = set()
SINGLES: set[str] = set()
# Set of abbreviations without periods, e.g. "td", "osfrv"
WRONGSINGLES: Set[str] = set()
WRONGSINGLES: set[str] = set()
# Potential sentence finishers, i.e. those with a dot at the end,
# marked with an asterisk in the config file
FINISHERS: Set[str] = set()
FINISHERS: set[str] = set()
# Abbreviations that should not be seen as such at the end of sentences,
# marked with an exclamation mark in the config file
NOT_FINISHERS: Set[str] = set()
NOT_FINISHERS: set[str] = set()
# Abbreviations that should not be seen as such at the end of sentences, but
# are allowed in front of person names; marked with a hat ^ in the config file
NAME_FINISHERS: Set[str] = set()
NAME_FINISHERS: set[str] = set()
# Wrong versions of abbreviations with possible corrections
# wrong version : [correction1, correction2, ...]
WRONGDOTS: Dict[str, List[str]] = defaultdict(list)
WRONGDOTS: dict[str, list[str]] = defaultdict(list)
# Word forms that should never be interpreted as abbreviations
NOT_ABBREVIATIONS: Set[str] = set()
NOT_ABBREVIATIONS: set[str] = set()

# Ensure that only one thread initializes the abbreviations
_lock = Lock()

@staticmethod
def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
""" Add an abbreviation to the dictionary.
Called from the config file handler. """
"""Add an abbreviation to the dictionary.
Called from the config file handler."""
# Check for sentence finishers
finisher = False
not_finisher = False
@@ -152,7 +149,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
# Append the abbreviation and its meaning in tuple form
# Multiple meanings are supported for each abbreviation
Abbreviations.DICT[abbrev].add(
BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, abbrev, "-",)
BIN_Tuple(
meaning,
0,
gender,
"skst" if fl is None else fl,
abbrev,
"-",
)
)
Abbreviations.MEANINGS.add(meaning)
# Adding wrong versions of abbreviations
@@ -169,7 +173,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
# as abbreviations, even though they are listed as such
# in the form 'Í.' and 'Á.' for use within person names
Abbreviations.WRONGDICT[wabbrev].add(
BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
BIN_Tuple(
meaning,
0,
gender,
"skst" if fl is None else fl,
wabbrev,
"-",
)
)

elif "." in abbrev:
@@ -182,15 +193,22 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
wabbrev = abbrev[:i] + abbrev[i + 1 :]
Abbreviations.WRONGDOTS[wabbrev].append(abbrev)
Abbreviations.WRONGDICT[wabbrev].add(
BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
BIN_Tuple(
meaning,
0,
gender,
"skst" if fl is None else fl,
wabbrev,
"-",
)
)
if len(indices) > 2:
# 3 or 4 dots currently in vocabulary
# Not all cases with 4 dots are handled.
i1 = indices[0]
i2 = indices[1]
i3 = indices[2]
wabbrevs: List[str] = []
wabbrevs: list[str] = []
# 1 and 2 removed
wabbrevs.append(abbrev[:i1] + abbrev[i1 + 1 : i2] + abbrev[i2 + 1 :])
# 1 and 3 removed
@@ -214,7 +232,14 @@ def add(abbrev: str, meaning: str, gender: str, fl: Optional[str] = None) -> None:
Abbreviations.WRONGSINGLES.add(wabbrev)
Abbreviations.WRONGDOTS[wabbrev].append(abbrev)
Abbreviations.WRONGDICT[wabbrev].add(
BIN_Tuple(meaning, 0, gender, "skst" if fl is None else fl, wabbrev, "-",)
BIN_Tuple(
meaning,
0,
gender,
"skst" if fl is None else fl,
wabbrev,
"-",
)
)
if finisher:
Abbreviations.FINISHERS.add(abbrev)
@@ -232,16 +257,16 @@ def has_abbreviation(meaning: str) -> bool:
return meaning in Abbreviations.MEANINGS

@staticmethod
def get_meaning(abbrev: str) -> Optional[List[BIN_Tuple]]:
""" Lookup meaning(s) of abbreviation, if available. """
def get_meaning(abbrev: str) -> Optional[list[BIN_Tuple]]:
"""Look up meaning(s) of abbreviation, if available."""
m = Abbreviations.DICT.get(abbrev)
if not m:
m = Abbreviations.WRONGDICT.get(abbrev)
return list(m) if m else None

@staticmethod
def _handle_abbreviations(s: str) -> None:
""" Handle abbreviations in the settings section """
"""Handle abbreviations in the settings section"""
# Format: abbrev[*] = "meaning" gender (kk|kvk|hk)
# An asterisk after an abbreviation ending with a period
# indicates that the abbreviation may finish a sentence
@@ -272,22 +297,25 @@

@staticmethod
def _handle_not_abbreviations(s: str) -> None:
""" Handle not_abbreviations in the settings section """
"""Handle not_abbreviations in the settings section"""
if len(s) < 3 or s[0] != '"' or s[-1] != '"':
raise ConfigError("not_abbreviations should be enclosed in double quotes")
Abbreviations.NOT_ABBREVIATIONS.add(s[1:-1])

@staticmethod
def initialize():
""" Read the abbreviations config file """
"""Read the abbreviations config file"""
with Abbreviations._lock:
if len(Abbreviations.DICT):
# Already initialized
return

section = None
config = open_text(package="tokenizer", resource="Abbrev.conf", encoding="utf-8")
for s in config:

p = importlib_resources.files("tokenizer").joinpath("Abbrev.conf")
config = p.read_text(encoding="utf-8")

for s in config.split("\n"):
# Ignore comments
ix = s.find("#")
if ix >= 0:
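The final hunk above replaces the deprecated importlib.resources.open_text() call with the files()/read_text() API and iterates over the lines itself. A standalone sketch of the new idiom, assuming a package named tokenizer that ships Abbrev.conf as package data:

# Reading bundled package data with the modern importlib.resources API,
# mirroring Abbreviations.initialize() above.
import importlib.resources as importlib_resources

p = importlib_resources.files("tokenizer").joinpath("Abbrev.conf")
config = p.read_text(encoding="utf-8")
for s in config.split("\n"):
    if s.strip() and not s.lstrip().startswith("#"):
        ...  # handle a non-comment config line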