Skip to content

Commit

Permalink
Update the tokenizer's URL_RE to include domain names ending with .com, possibly w/ TLD, w/o www
Browse files Browse the repository at this point in the history
  • Loading branch information
AngledLuffa committed Sep 20, 2024
1 parent d29896f commit 4421213
Showing 1 changed file with 1 addition and 1 deletion.
2 changes: 1 addition & 1 deletion stanza/models/tokenization/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def process_sentence(sentence, mwt_dict=None):

# https://stackoverflow.com/questions/3809401/what-is-a-good-regular-expression-to-match-a-url
# modification: disallow " as opposed to all ^\s
#
# Earlier version of the pattern (this view shows both sides of a
# one-line change): four alternatives inside a single non-capturing group,
# each requiring either an http(s):// scheme or a leading "www." in front
# of the host name.
URL_RAW_RE = r"""(?:https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s"]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s"]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s"]{2,}|www\.[a-zA-Z0-9]+\.[^\s"]{2,})"""
# Updated version; as written here it rebinds the name, so this is the value
# that takes effect.  It keeps the same four alternatives and appends a fifth,
# placed after the closing parenthesis of the group: a run of alphanumerics
# followed by ".com", optionally followed by one more "."-prefixed suffix of
# at least two characters that are neither whitespace nor a double quote.
# No scheme or "www." prefix is needed for this alternative.
URL_RAW_RE = r"""(?:https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s"]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s"]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s"]{2,}|www\.[a-zA-Z0-9]+\.[^\s"]{2,})|[a-zA-Z0-9]+\.com(?:\.[^\s"]{2,})?"""

# The email and URL patterns are interpolated into one enclosing non-capturing
# group, so the top-level "|" in URL_RAW_RE is contained by that group rather
# than splitting the whole compiled expression.
MASK_RE = re.compile(f"(?:{EMAIL_RAW_RE}|{URL_RAW_RE})")

Expand Down

0 comments on commit 4421213

Please sign in to comment.